This is an R Markdown Notebook for analysis using data on the DC Bus System (WMATA Metrobus). The data were obtained here:
https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/
These analyses accompany a Shiny dashboard on wait times, found here:
https://mdat.shinyapps.io/DCMetroBus_WaitTimes_20170319/
Load the packages to be used.
# install.packages('rgeos', type='source')
# install.packages('rgdal', type='source')
# install.packages("NbClust")
library("jsonlite") # manipulating JSON files for zip codes
library("sqldf") # sql-based data manipulation
library("tcltk")
library("tidyr") # data manipulation
library("plyr") # data manipulation
library("dplyr") # data manipulation
library("magrittr") # data manipulation (piping data)
library("stringr") # string manipulation
library("data.table") # used in testing data manipulation for speed increases
library("lubridate") # date manipulation
library("geosphere") # calculating Haversine distance
library("ggplot2") # general plotting
library("ggvis") # general plotting
library("rbokeh") # general plotting
library("ggmap") # general plotting of maps
library("rgdal") # used in plotting shapefiles
library("broom") # used in plotting shapefiles
library("maptools") # used in plotting shapefiles
library("rgeos") # used in plotting shapefiles
library("caret") # used in PCA
library("cluster") # used for clustering
library("fpc") # used for clustering
library("dbscan") # used for clustering
library("NbClust") # used for clustering
library("factoextra") # plotting clusters
Get the Bus data.
First let’s check the working directory.
# Print the current working directory before it is changed below.
getwd()
Then, actually get the data.
# Point the session at the folder that holds the daily AVL extracts.
# NOTE(review): BasePath is not defined in the visible part of this notebook;
# it must already exist in the session.
setwd(paste0(BasePath, "DCMetroBus/Bus AVL Oct 2016"))

# Read one tab-delimited file per day (file suffixes 3 to 7) into its own
# data frame named Oct03Raw ... Oct07Raw, then print the name and structure.
for (i in 3:7) {
  assign(
    sprintf("Oct0%dRaw", i),
    read.delim(
      sprintf("2016100%dMetrobusAVL.txt", i),
      sep = "\t",
      header = TRUE,
      na.strings = NULL
    )
  )
  message(sprintf("Oct0%dRaw", i))
  str(get(sprintf("Oct0%dRaw", i)))
}
'data.frame': 620274 obs. of 17 variables:
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : Factor w/ 266 levels "10A","10B","10E",..: 224 224 224 224 224 224 224 224 224 224 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 11 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : Factor w/ 10552 levels "","1000001","1000003",..: 9682 9682 9683 9641 9640 8136 9668 9668 9796 9795 ...
$ Stop_Desc : Factor w/ 7740 levels "10TH ST + MICHIGAN AVE",..: 1346 1346 7417 7418 1346 2940 2939 2939 6926 6929 ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : Factor w/ 75354 levels "10-3-16 1:00:00 AM",..: 47380 47506 47740 47814 47864 48244 48302 48540 49086 49190 ...
$ Departure_Time : Factor w/ 75396 levels "10-3-16 1:00:00 AM",..: 47406 47554 47766 47840 47890 48270 48536 48566 49112 49216 ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
'data.frame': 623427 obs. of 17 variables:
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : Factor w/ 266 levels "10A","10B","10E",..: 225 225 225 225 225 225 225 225 225 225 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 6 6 6 6 6 6 6 ...
$ Route_Direction : Factor w/ 9 levels "ANTICLKW","CLOCKWIS",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Stop_Sequence : int 1 1 8 1 1 1 1 3 2 5 ...
$ Stop_ID : Factor w/ 10555 levels "","1000001","1000003",..: 9671 9671 8138 8138 8138 8138 8138 9798 9799 9638 ...
$ Stop_Desc : Factor w/ 7717 levels "10TH ST + MICHIGAN AVE",..: 2939 2939 2940 2940 2940 2940 2940 6906 6903 4205 ...
$ Event_Type : int 3 4 3 3 3 5 5 4 5 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 1 3 1 1 1 2 2 3 2 3 ...
$ Event_Time : Factor w/ 77713 levels "10-4-16 1:00:00 AM",..: 49126 49240 50858 50908 50976 51116 51172 51714 51842 51940 ...
$ Departure_Time : Factor w/ 77739 levels "10-4-16 1:00:00 AM",..: 49209 49251 50869 50957 50987 51165 51185 51725 51933 51951 ...
$ Dwell_Time : int 79 0 59 19 59 19 1 0 40 0 ...
$ Delta_Time : int 35 36 36 246 244 255 264 159 129 139 ...
$ Odometer_Distance: int 56958 60750 69747 69971 69747 71136 71177 76520 77425 78353 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 58 89 82 76 79 301 274 104 310 2 ...
'data.frame': 630900 obs. of 17 variables:
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : Factor w/ 266 levels "10A","10B","10E",..: 224 224 224 224 224 224 224 224 224 224 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Route_Direction : Factor w/ 11 levels "ANTICLKW","CLOCKWIS",..: 5 5 5 5 5 5 5 5 5 5 ...
$ Stop_Sequence : int 1 1 3 4 5 4 3 3 7 7 ...
$ Stop_ID : Factor w/ 10543 levels "","1000001","1000003",..: 9659 9659 9632 9633 9624 9633 9632 9632 9673 9673 ...
$ Stop_Desc : Factor w/ 7725 levels "10TH ST + MICHIGAN AVE",..: 2946 2946 7403 7401 7401 7401 7403 7403 1346 1346 ...
$ Event_Type : int 3 5 4 4 3 4 3 3 5 5 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 1 2 3 3 1 3 1 1 2 2 ...
$ Event_Time : Factor w/ 77725 levels "10-5-16 1:00:00 AM",..: 49279 49371 49899 49953 49993 50135 50221 50493 50783 50987 ...
$ Departure_Time : Factor w/ 77716 levels "10-5-16 1:00:00 AM",..: 49257 49353 49877 49931 49997 50113 50421 50489 50767 50999 ...
$ Dwell_Time : int 189 2 0 0 13 0 111 9 3 17 ...
$ Delta_Time : int 4 78 -114 -3 19 93 297 191 382 499 ...
$ Odometer_Distance: int 37932 38703 44242 44327 44645 45927 46733 47077 50461 51916 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 316 104 276 246 229 246 345 207 109 305 ...
'data.frame': 621948 obs. of 17 variables:
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : Factor w/ 265 levels "10A","10B","10E",..: 224 224 224 224 224 224 224 224 224 224 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Route_Direction : Factor w/ 9 levels "ANTICLKW","CLOCKWIS",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Stop_Sequence : int 1 1 7 6 5 4 3 7 7 7 ...
$ Stop_ID : Factor w/ 10562 levels "","1000001","1000003",..: 9678 9678 9692 9693 9643 9652 9651 9692 9692 9692 ...
$ Stop_Desc : Factor w/ 7723 levels "10TH ST + MICHIGAN AVE",..: 2937 2937 1342 7400 7399 7399 7401 1342 1342 1342 ...
$ Event_Type : int 3 5 4 5 3 4 3 4 5 5 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 1 2 3 2 1 3 1 3 2 2 ...
$ Event_Time : Factor w/ 77758 levels "10-6-16 1:00:00 AM",..: 49294 49384 49982 49998 50058 50186 50270 50518 51002 51064 ...
$ Departure_Time : Factor w/ 77792 levels "10-6-16 1:00:00 AM",..: 49305 49399 49993 50023 50091 50197 50487 50529 51065 51079 ...
$ Dwell_Time : int 148 2 0 7 11 0 103 0 26 2 ...
$ Delta_Time : int -6 64 -87 -93 31 104 303 175 497 504 ...
$ Odometer_Distance: int 37950 38726 44130 44197 44592 45935 46739 51826 51826 51838 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 284 71 199 164 223 246 343 199 306 320 ...
'data.frame': 622894 obs. of 17 variables:
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : Factor w/ 266 levels "10A","10B","10E",..: 224 224 224 224 224 224 224 224 224 224 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 6 6 6 6 6 6 7 7 ...
$ Route_Direction : Factor w/ 7 levels "ANTICLKW","CLOCKWIS",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Stop_Sequence : int 1 1 2 3 3 5 6 7 1 2 ...
$ Stop_ID : Factor w/ 10556 levels "","1000001","1000003",..: 9672 9672 9800 9799 9799 9639 9640 9641 9641 9642 ...
$ Stop_Desc : Factor w/ 7699 levels "10TH ST + MICHIGAN AVE",..: 2930 2930 6886 6889 6889 4196 4199 6887 6887 4198 ...
$ Event_Type : int 3 4 3 4 5 4 4 3 3 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 1 3 1 3 2 3 3 1 1 3 ...
$ Event_Time : Factor w/ 77562 levels "10-7-16 1:00:00 AM",..: 49134 49136 51718 51756 51888 51934 52018 52044 52130 52288 ...
$ Departure_Time : Factor w/ 77649 levels "10-7-16 1:00:00 AM",..: 49193 49195 51779 51815 51953 51993 52077 52103 52189 52347 ...
$ Dwell_Time : int 153 0 1 0 3 0 0 120 120 0 ...
$ Delta_Time : int 57 56 165 270 197 201 181 189 235 288 ...
$ Odometer_Distance: int 37846 42154 56018 56611 57411 58341 59084 59787 59787 60252 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 98 89 201 104 26 2 1 247 247 182 ...
Put the daily data together.
# Stack the five daily tables into one; the new "group" column records which
# list element (in the order given here) each row came from.
AllDays <- list(Oct03Raw, Oct04Raw, Oct05Raw, Oct06Raw, Oct07Raw) %>%
  bind_rows(.id = "group")
# dim(AllDays)
str(AllDays)
'data.frame': 3119443 obs. of 18 variables:
$ group : chr "1" "1" "1" "1" ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : chr "LOOP" "LOOP" "LOOP" "LOOP" ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : chr "10-3-16 6:06:47 AM" "10-3-16 6:07:50 AM" "10-3-16 6:09:47 AM" "10-3-16 6:10:24 AM" ...
$ Departure_Time : chr "10-3-16 6:06:47 AM" "10-3-16 6:08:01 AM" "10-3-16 6:09:47 AM" "10-3-16 6:10:24 AM" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
Deleting old data frames.
# Drop each day's raw table now that AllDays holds the combined rows.
for (i in 3:7) {
  rm(list = ls(pattern = sprintf("Oct0%dRaw", i)))
  message(sprintf("Deleting Oct0%dRaw", i))
}
Deleting Oct03Raw
Deleting Oct04Raw
Deleting Oct05Raw
Deleting Oct06Raw
Deleting Oct07Raw
Updating variable types.
Then, sorting the data and adding a row number (RowNum_OG), to be used for identifying rows later in the analyses.
# The index left behind by the loops above is no longer needed.
rm(i)

# group and Route_Direction become factors.
AllDays[["group"]] <- factor(AllDays[["group"]])
AllDays[["Route_Direction"]] <- factor(AllDays[["Route_Direction"]])

# Timestamps arrive as text such as "10-3-16 6:06:47 AM"
# (month-day-2-digit-year, 12-hour clock with AM/PM marker).
AllDays[["Event_Time"]] <- as.POSIXct(
  AllDays[["Event_Time"]],
  format = "%m-%d-%y %I:%M:%S %p"
)
AllDays[["Departure_Time"]] <- as.POSIXct(
  AllDays[["Departure_Time"]],
  format = "%m-%d-%y %I:%M:%S %p"
)
str(AllDays)
'data.frame': 3119443 obs. of 18 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:07:50" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:08:01" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
# Order events by bus and then by event time, and stamp every row with its
# position in that ordering so the row can be identified later on.
AllDays_Sorted <- AllDays %>%
  arrange(Bus_ID, Event_Time) %>%
  mutate(RowNum_OG = row_number())
rm(AllDays)
str(AllDays_Sorted)
'data.frame': 3119443 obs. of 19 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:07:50" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:08:01" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
$ RowNum_OG : int 1 2 3 4 5 6 7 8 9 10 ...
# View(head(AllDays_Sorted, 100))
Inspecting the values of Stop_ID, and finding that it can take the values “” (blank) and “NULL”.
Creating a table of distinct Stop_Desc values when Stop_ID is “” (blank) or “NULL”.
# Collect every distinct (Stop_ID, Stop_Desc) pair whose Stop_ID is missing,
# blank, or the literal string "NULL", and give each pair a sequential
# replacement id in the StopID_New column.
StopID_New <- AllDays_Sorted %>%
  filter(is.na(Stop_ID) | Stop_ID %in% c("", "NULL")) %>%
  select(Stop_ID, Stop_Desc) %>%
  distinct() %>%
  arrange(Stop_ID, Stop_Desc) %>%
  # row_number() gives integer(0) on an empty table, whereas 1:nrow(.) would
  # give c(1, 0) and make mutate() fail.
  mutate(StopID_New = row_number())

# View() needs an interactive session; skip it when the notebook is rendered
# non-interactively.
if (interactive()) {
  View(StopID_New)
}
StopID_New
Creating a full updated table by filling in StopID_New for when Stop_ID is “” (blank) or NULL.
# Attach the replacement ids to the event table and derive a cleaned stop id.
#
# The join is keyed on BOTH Stop_ID and Stop_Desc. Joining on Stop_Desc alone
# would (a) overwrite valid Stop_IDs on rows that merely share a description
# with a bad-ID row and flag them "ID_Bad", and (b) duplicate event rows when
# one description appears under more than one bad Stop_ID value.
AllDays_StopIDNew <- AllDays_Sorted %>%
  left_join(
    select(StopID_New, Stop_ID, Stop_Desc, StopID_New),
    by = c("Stop_ID", "Stop_Desc")
  ) %>%
  mutate(
    # Keep the original Stop_ID unless a replacement id was matched. The
    # replacement is an integer, so convert it explicitly to character.
    StopID_Clean = ifelse(
      is.na(StopID_New),
      Stop_ID,
      as.character(StopID_New)
    ),
    # Flag which rows had their id replaced.
    StopID_Indicator = factor(
      ifelse(is.na(StopID_New), "ID_OK", "ID_Bad")
    )
  )
rm(StopID_New)
rm(AllDays_Sorted)
str(AllDays_StopIDNew)
'data.frame': 3119443 obs. of 22 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:07:50" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:08:01" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
$ RowNum_OG : int 1 2 3 4 5 6 7 8 9 10 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004572" "5004573" "5002210" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
# View(tail(AllDays_StopIDNew, 500))
# View(filter(AllDays_StopIDNew,
# Stop_Desc == "METROWAY ANNNOUCEMNT CORR"
# )
# )
Lat Long stats for pulling in Zip codes later.
Pulling in Zip Code data from api.geonames.org.
Join to create one dataset that also includes Zip variables.
# Add the per-stop lat/long statistics and zip-code fields to every event,
# matching on the cleaned stop id, and give the geography columns
# Stop_-prefixed names.
# NOTE(review): LL_StatsZips is built in a part of the notebook that is not
# shown here; it must already exist in the session.
AllDays_Zips <- AllDays_StopIDNew %>%
  left_join(LL_StatsZips, by = "StopID_Clean") %>%
  rename(
    Stop_State = adminCode1,
    Stop_County = adminName2,
    Stop_City = placeName,
    Stop_Zip = postalCode
  )
rm(AllDays_StopIDNew, LL_StatsZips)
str(AllDays_Zips)
'data.frame': 3119443 obs. of 41 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:07:50" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:08:01" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
$ RowNum_OG : int 1 2 3 4 5 6 7 8 9 10 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004572" "5004573" "5002210" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -0.000794 -0.000794 -0.000185 -0.000173 0.000162 ...
$ Lng_MeaLessMed : num 3.72e-04 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 ...
$ RowNum : int 9715 9715 9716 9674 9673 8168 9701 9701 9829 9828 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : chr "VA" "VA" "VA" "VA" ...
$ Stop_County : chr "Fairfax" "Fairfax" "Fairfax" "Fairfax" ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : chr "0" "0" "0" "0" ...
$ countryCode : chr "US" "US" "US" "US" ...
$ Stop_Zip : chr "22310" "22310" "22310" "22310" ...
$ adminName1 : chr "Virginia" "Virginia" "Virginia" "Virginia" ...
$ Stop_City : chr "Alexandria" "Alexandria" "Alexandria" "Alexandria" ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
Updating variable types.
# Convert the geography text columns to factors and the distance column
# (read in as text) to numeric.
AllDays_Zips <- mutate(
  AllDays_Zips,
  Stop_State = factor(Stop_State),
  Stop_County = factor(Stop_County),
  Stop_Zip = factor(Stop_Zip),
  Stop_City = factor(Stop_City),
  distance = as.numeric(distance),
  countryCode = factor(countryCode),
  adminName1 = factor(adminName1)
)
str(AllDays_Zips)
'data.frame': 3119443 obs. of 41 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:07:50" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:08:01" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
$ RowNum_OG : int 1 2 3 4 5 6 7 8 9 10 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004572" "5004573" "5002210" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -0.000794 -0.000794 -0.000185 -0.000173 0.000162 ...
$ Lng_MeaLessMed : num 3.72e-04 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 ...
$ RowNum : int 9715 9715 9716 9674 9673 8168 9701 9701 9829 9828 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 150 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 2 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
Feature engineering.
Inspecting incidences of consecutive Stop_IDs. This is done because investigation showed that many consecutive events occur at the same Stop_ID, but with various Dwell_Times, Odometer_Distances, etc., all of which affect calculations and analyses.
Create data on the runs (consecutive Stop_IDs).
# Find runs of consecutive records at the same stop.
#
# Runs are keyed on bus AND stop. The table is sorted by Bus_ID, so keying on
# StopID_Clean alone would merge the last record(s) of one bus with the first
# record(s) of the next bus whenever both sit at the same stop.
# NOTE(review): a run can still span a long time gap for the same bus (for
# example overnight at one stop); confirm whether that is acceptable.
StopID_Runs <- rle(
  paste(AllDays_Zips$Bus_ID, AllDays_Zips$StopID_Clean, sep = "|")
)

# Last and first row index of every run. The start is derived directly from
# end and length, so it does not rely on dplyr::lag() masking stats::lag()
# and it also works when there are no rows.
StopID_Runs$ends <- cumsum(StopID_Runs$lengths)
StopID_Runs$starts <- StopID_Runs$ends - StopID_Runs$lengths + 1

# Keep `values` as the plain stop id of each run rather than the composite
# bus|stop key used to detect the runs.
StopID_Runs$values <- AllDays_Zips$StopID_Clean[StopID_Runs$starts]
str(StopID_Runs)
List of 4
$ lengths: int [1:2809529] 2 1 1 1 1 2 1 1 1 1 ...
$ values : chr [1:2809529] "5004572" "5004573" "5002210" "5002209" ...
$ ends : int [1:2809529] 2 3 4 5 6 8 9 10 11 12 ...
$ starts : num [1:2809529] 1 3 4 5 6 7 9 10 11 12 ...
- attr(*, "class")= chr "rle"
# class(StopID_Runs)
#
# StopID_Runs_df <- data.frame(unclass(StopID_Runs))
# str(StopID_Runs_df)
# class(StopID_Runs_df)
# rm(StopID_Runs_df)
Trying to link data on RunsGroups with the original data (AllDays_Sorted). The goal is to select only one record per RunsGroup - that being the record with the longest Dwell_Time.
I attempted this computation using both data.frames (dplyr) and data.tables (data.table). However, with 2,809,062 rows in one dataset and 3,119,443 rows in the other dataset, the current computation time is over 5 days…so I’m trying a different strategy to only select the first record in a run.
# Create a RunsGroup variable for each run
# StopID_Runs_df$RunsGroup <- paste0("g", seq(1:nrow(StopID_Runs_df)
# )
# )
#
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
#
# StopID_Runs_df <- StopID_Runs_df %>%
# mutate(RowNum = row_number()
# )
#
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
#
#
# # Converting to data.tables for, hopefully, improved performance (speed) in computation
# StopID_Runs_dt <- data.table(StopID_Runs_df)
# setkey(StopID_Runs_dt, RowNum)
# str(StopID_Runs_dt)
#
# AllDays_Sorted_dt <- data.table(AllDays_Sorted)
# setkey(AllDays_Sorted_dt, RowNum_OG)
# str(AllDays_Sorted_dt)
# # rm(AllDays_Sorted_dt)
#
#
# # Actual loop to perform the computations and link to original data (AllDays_Sorted_dt)
# GroupData <- list()
# for(i in 1:nrow(StopID_Runs_dt)
# ) {
# assign(paste0("group_", i),
# StopID_Runs_dt[RowNum == i, RunsGroup]
# )
#
# ##### The code below is the same code as above, but done with dplyr #####
#
# # assign(paste0("group_", i),
# # filter(StopID_Runs_df,
# # RowNum == i
# # ) %>%
# # select(RunsGroup)
# # )
#
# assign(paste0("group_", i, "_start"),
# StopID_Runs_dt[RowNum == i, starts]
# )
#
# assign(paste0("group_", i, "_end"),
# StopID_Runs_dt[RowNum == i, ends]
# )
#
# assign(paste0("group_", i, "_rows"),
# AllDays_Sorted_dt[RowNum_OG >= as.numeric(get(paste0("group_", i, "_start")
# )
# ) &
# RowNum_OG <= as.numeric(get(paste0("group_", i, "_end")
# )
# ),
# RunsGroup := as.character(get(paste0("group_", i)
# )
# )
# ]
#
# ##### The code below is the same as the code above, but done with dplyr #####
#
# # filter(AllDays_Sorted,
# # between(RowNum_OG,
# # as.numeric(get(paste0("group_", i, "_start")
# # )
# # ),
# # as.numeric(get(paste0("group_", i, "_end")
# # )
# # )
# # )
# # ) %>%
# # mutate(RunsGroup = as.character(get(paste0("group_", i)
# # )
# # )
# # )
# )
#
# GroupData[[i]] <- get(paste0("group_", i, "_rows"))
#
# message("Processing Group ", i, " of 2,809,062")
# }
#
#
# GroupData_df <- rbind.fill(GroupData)
# str(GroupData_df)
# head(GroupData_df)
# tail(GroupData_df)
# # rm(GroupData_df)
#
#
# group_1
# group_1_start
# group_1_end
# group_1_rows
# group_2_rows
# group_3_rows
# group_50_rows
# str(group_50_rows)
# group_2809062_rows
# GroupData[[1]]
# GroupData[[50]]
#
#
# ##### Testing Area (Below) #####
# ##### Testing Area (Below) #####
# ##### Testing Area (Below) #####
#
# # head(StopID_Runs$starts, 20)
# # head(AllDays_NewOrder$Stop_ID, 20)
# #
# #
# # dat <- as.data.frame(c(1,1,7,7,7,9,6,8,2,2,2,1,1,1,1,1))
# # colnames(dat)[1] <- "dat"
# # r <- rle(dat$dat)
# # dat$run <- rep(r$lengths, r$lengths)
# # dat$runLag <- lag(dat$run)
# # dat$cond <- rep(r$values, r$lengths)
# # dat
# # View(dat)
When consecutive records occur at the same Stop_ID, only the first occurrence is kept. This is done because the computation time to select only the record with the longest Dwell_Time for each run was too long (over 5 days).
This is probably less than ideal with regard to Dwell_Time, but should not make much difference for calculations of travel time, speed, etc.
# Keep only the row at the start of each run, i.e. the first event of every
# stretch of consecutive records at the same stop.
AllDays_FirstStopID <- AllDays_Zips[StopID_Runs[["starts"]], , drop = FALSE]
dim(AllDays_Zips)
[1] 3119443 41
# Size of the table after keeping one row per run.
dim(AllDays_FirstStopID)
[1] 2809529 41
# Number of rows dropped by keeping only the first record of each run.
nrow(AllDays_Zips) - nrow(AllDays_FirstStopID)
[1] 309914
# The full table and the run bookkeeping are no longer needed.
rm(list = c("AllDays_Zips", "StopID_Runs"))
str(AllDays_FirstStopID)
'data.frame': 2809529 obs. of 41 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance: int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
Feature engineering.
Creating new variables.
# Derive distance, dwell-time and calendar features from each event.
AllDays_AddVars <- AllDays_FirstStopID %>%
  mutate(
    # 5,280 feet in 1 mile.
    # NOTE(review): assumes Odometer_Distance is recorded in feet; confirm.
    Odometer_Distance_Mi = Odometer_Distance / 5280,
    # Pin the unit to seconds. Subtracting two POSIXct vectors lets R pick
    # the unit automatically from the smallest difference, so the numbers
    # could silently become minutes or hours on different data.
    Dwell_Time2 = as.numeric(
      difftime(Departure_Time, Event_Time, units = "secs")
    ),
    Event_Time_Yr = as.integer(year(Event_Time)),
    Event_Time_Mth = as.integer(month(Event_Time)),
    Event_Time_Date = day(Event_Time),
    Event_Time_Day = wday(Event_Time, label = TRUE),
    Event_Time_Hr = hour(Event_Time),
    Event_Time_Min = minute(Event_Time),
    # Three-hour buckets [0,3), [3,6), ..., [21,24) as an ordered factor.
    Event_Time_HrGroup = cut(
      Event_Time_Hr,
      breaks = seq(0, 24, by = 3),
      labels = c(
        "Group0_2",
        "Group3_5",
        "Group6_8",
        "Group9_11",
        "Group12_14",
        "Group15_17",
        "Group18_20",
        "Group21_23"
      ),
      right = FALSE,
      ordered_result = TRUE
    )
  )
rm(AllDays_FirstStopID)
str(AllDays_AddVars)
'data.frame': 2809529 obs. of 50 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
$ Odometer_Distance_Mi: num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
Function for calculating the distance traveled based on the Haversine formula. Original code from: https://www.r-bloggers.com/great-circle-distance-calculations-in-r/
# Calculates the geodesic distance between two points specified by radian latitude/longitude using the Haversine formula (hf)
# gcd.hf <- function(long1, lat1, long2, lat2) {
# R <- 6371 # Earth mean radius [km]
# delta.long <- (long2 - long1)
# delta.lat <- (lat2 - lat1)
# a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
# c <- 2 * asin(min(1,sqrt(a)))
# d = R * c * 0.621371 # 1 km = 0.621371 miles
# return(d) # Distance in miles
# }
Feature engineering.
Creating more variables. Creating a BusEvent row number for future identification purposes. Then, creating various variables to analyze distance traveled and speed.
# Per-bus, per-day feature engineering. For every event this derives the
# distance, time and average speed of the leg travelled since the same bus's
# previous event on the same date, plus identifiers for the start/stop pair.
#
# NOTE(review): lag() follows the existing row order, so this assumes the rows
# are already sorted by event time within each Bus_ID / Event_Time_Date group
# - confirm upstream.
AllDays_BusDay <- group_by(AllDays_AddVars,
                           Bus_ID,
                           Event_Time_Date
                           ) %>%
  mutate(
    # Sequence number of the event within the bus/day; used to identify bus
    # movements on a particular date.
    BusDay_EventNum = row_number(),
    # Previous event's route values; used later to detect Route and
    # RouteAlt (direction) changes.
    Route_Lag1 = lag(Route),
    RouteAlt_Lag1 = lag(RouteAlt),
    # Previous event's odometer reading and position.
    Odometer_Distance_Lag1 = lag(Odometer_Distance),
    Latitude_L1 = lag(Latitude),
    Longitude_L1 = lag(Longitude),
    # Odometer-based leg distance. Only strictly increasing readings are
    # kept; zero or negative differences become NA.
    TravelDistance_Ft = ifelse(Odometer_Distance > Odometer_Distance_Lag1,
                               Odometer_Distance - Odometer_Distance_Lag1,
                               NA
                               ),
    TravelDistance_Mi = TravelDistance_Ft / 5280, # 5,280 feet in 1 mile
    # Great-circle (Haversine) leg distance between the previous and current
    # coordinates; distHaversine() returns meters by default.
    TravelDistance_Mi_Hvrs = distHaversine(cbind(Longitude_L1, Latitude_L1),
                                           cbind(Longitude, Latitude)
                                           ) * 0.000621371, # miles per meter
    # Leg travel time in seconds; zero or negative gaps become NA.
    # The units are requested explicitly: a bare `Event_Time - lag(...)`
    # lets difftime() choose secs/mins/hours/days from the smallest gap in
    # the group, and ifelse() then drops the unit, so groups whose gaps are
    # all >= 60 s would silently be stored in minutes (or hours).
    TravelTime_Sec = as.numeric(ifelse(Event_Time > lag(Departure_Time),
                                       difftime(Event_Time,
                                                lag(Departure_Time),
                                                units = "secs"
                                                ),
                                       NA
                                       )
                                ),
    TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
    # Average speed over the leg; NA when the travel time is not positive.
    SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
                          TravelDistance_Mi / TravelTime_Hr,
                          NA
                          ),
    # Start (previous) stop of the leg and the combined start--stop key.
    # The first event of each bus/day has no start stop and is keyed "NULL--<stop>".
    Start_ID = lag(StopID_Clean),
    Start_Desc = lag(Stop_Desc),
    StartStop_ID = ifelse(is.na(Start_ID),
                          paste("NULL", StopID_Clean, sep = "--"),
                          paste(Start_ID, StopID_Clean, sep = "--")
                          )
  ) %>%
  ungroup() %>% # drop the grouping explicitly before converting
  as.data.frame()
# Free the input table and inspect the result.
rm(AllDays_AddVars)
str(AllDays_BusDay)
'data.frame': 2809529 obs. of 65 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Route_Lag1 : chr NA "S80" "S80" "S80" ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
# summary(AllDays_BusDay)
# View(tail(AllDays_BusDay, 50))
Inspecting for issues with StartStop_ID (where the value is either NA or contains NULL). They ONLY exist when BusDay_EventNum = 1 (which is by design). So everything looks OK.
Overall stats (2nd/98th percentiles, means, medians, and counts) for TravelDistance_Mi, TravelTime_Sec, and TravelTime_Hr.
# Attach the overall 2nd and 98th percentile cut-offs of travel distance
# (miles) and travel time (seconds and hours) to every row as constant
# columns, then convert the result to a data.table.
Quantiles_dt <- mutate(
  AllDays_BusDay,
  TD_Mi_q2 = quantile(TravelDistance_Mi, probs = 0.02, na.rm = TRUE),
  TD_Mi_q98 = quantile(TravelDistance_Mi, probs = 0.98, na.rm = TRUE),
  TT_Sec_q2 = quantile(TravelTime_Sec, probs = 0.02, na.rm = TRUE),
  TT_Sec_q98 = quantile(TravelTime_Sec, probs = 0.98, na.rm = TRUE),
  TT_Hr_q2 = quantile(TravelTime_Hr, probs = 0.02, na.rm = TRUE),
  TT_Hr_q98 = quantile(TravelTime_Hr, probs = 0.98, na.rm = TRUE)
) %>%
  data.table()
# Overall summary statistics, attached to every row as constant columns.
# For each of TravelDistance_Mi, TravelTime_Sec and TravelTime_Hr:
#   *_Mean / *_Med / *_Cnt       - mean, median and non-NA count of all values
#   *_Mean_F / *_Med_F / *_Cnt_F - the same, restricted to values lying
#                                  within the [q2, q98] cut-offs (inclusive)
Stats <- mutate(
  Quantiles_dt,
  # Travel distance (miles)
  TD_Mi_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
  TD_Mi_Mean_F = mean(
    TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_q2 &
                        TravelDistance_Mi <= TD_Mi_q98],
    na.rm = TRUE
  ),
  TD_Mi_Med = median(TravelDistance_Mi, na.rm = TRUE),
  TD_Mi_Med_F = median(
    TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_q2 &
                        TravelDistance_Mi <= TD_Mi_q98],
    na.rm = TRUE
  ),
  TD_Mi_Cnt = sum(!is.na(TravelDistance_Mi)),
  TD_Mi_Cnt_F = sum(!is.na(
    TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_q2 &
                        TravelDistance_Mi <= TD_Mi_q98]
  )),
  # Travel time (seconds)
  TT_Sec_Mean = mean(TravelTime_Sec, na.rm = TRUE),
  TT_Sec_Mean_F = mean(
    TravelTime_Sec[TravelTime_Sec >= TT_Sec_q2 &
                     TravelTime_Sec <= TT_Sec_q98],
    na.rm = TRUE
  ),
  TT_Sec_Med = median(TravelTime_Sec, na.rm = TRUE),
  TT_Sec_Med_F = median(
    TravelTime_Sec[TravelTime_Sec >= TT_Sec_q2 &
                     TravelTime_Sec <= TT_Sec_q98],
    na.rm = TRUE
  ),
  TT_Sec_Cnt = sum(!is.na(TravelTime_Sec)),
  TT_Sec_Cnt_F = sum(!is.na(
    TravelTime_Sec[TravelTime_Sec >= TT_Sec_q2 &
                     TravelTime_Sec <= TT_Sec_q98]
  )),
  # Travel time (hours)
  TT_Hr_Mean = mean(TravelTime_Hr, na.rm = TRUE),
  TT_Hr_Mean_F = mean(
    TravelTime_Hr[TravelTime_Hr >= TT_Hr_q2 &
                    TravelTime_Hr <= TT_Hr_q98],
    na.rm = TRUE
  ),
  TT_Hr_Med = median(TravelTime_Hr, na.rm = TRUE),
  TT_Hr_Med_F = median(
    TravelTime_Hr[TravelTime_Hr >= TT_Hr_q2 &
                    TravelTime_Hr <= TT_Hr_q98],
    na.rm = TRUE
  ),
  TT_Hr_Cnt = sum(!is.na(TravelTime_Hr)),
  TT_Hr_Cnt_F = sum(!is.na(
    TravelTime_Hr[TravelTime_Hr >= TT_Hr_q2 &
                    TravelTime_Hr <= TT_Hr_q98]
  ))
) %>%
  data.frame()
# Drop the two inputs that are no longer needed, then inspect the result.
rm(AllDays_BusDay, Quantiles_dt)
str(Stats)
'data.frame': 2809529 obs. of 89 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Route_Lag1 : chr NA "S80" "S80" "S80" ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Hr_Mean : num 0.0291 0.0291 0.0291 0.0291 0.0291 ...
$ TT_Hr_Mean_F : num 0.0157 0.0157 0.0157 0.0157 0.0157 ...
$ TT_Hr_Med : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Med_F : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Hr_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
# View(head(Stats, 50))
Stats (5th/95th percentiles, means, medians, and counts) for each StartStop_ID, i.e., each start-stop pair.
# Per start-stop pair (StartStop_ID): attach the 5th and 95th percentile
# cut-offs of travel distance (miles) and travel time (seconds and hours)
# to every row of the pair, then convert the result to a data.table.
Quantiles_SS_dt <- Stats %>%
  group_by(StartStop_ID) %>%
  mutate(
    TD_Mi_SS_q5 = quantile(TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
    TD_Mi_SS_q95 = quantile(TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
    TT_Sec_SS_q5 = quantile(TravelTime_Sec, probs = 0.05, na.rm = TRUE),
    TT_Sec_SS_q95 = quantile(TravelTime_Sec, probs = 0.95, na.rm = TRUE),
    TT_Hr_SS_q5 = quantile(TravelTime_Hr, probs = 0.05, na.rm = TRUE),
    TT_Hr_SS_q95 = quantile(TravelTime_Hr, probs = 0.95, na.rm = TRUE)
  ) %>%
  data.table()
# Summary statistics per start-stop pair (StartStop_ID), attached to every
# row of the pair. For each of TravelDistance_Mi, TravelTime_Sec and
# TravelTime_Hr:
#   *_SS_Mean / *_SS_Med / *_SS_Cnt       - mean, median and non-NA count
#   *_SS_Mean_F / *_SS_Med_F / *_SS_Cnt_F - the same, restricted to values
#                                           within the pair's [q5, q95]
#                                           cut-offs (inclusive)
Stats_StSt <- Quantiles_SS_dt %>%
  group_by(StartStop_ID) %>%
  mutate(
    # Travel distance (miles)
    TD_Mi_SS_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
    TD_Mi_SS_Mean_F = mean(
      TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_SS_q5 &
                          TravelDistance_Mi <= TD_Mi_SS_q95],
      na.rm = TRUE
    ),
    TD_Mi_SS_Med = median(TravelDistance_Mi, na.rm = TRUE),
    TD_Mi_SS_Med_F = median(
      TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_SS_q5 &
                          TravelDistance_Mi <= TD_Mi_SS_q95],
      na.rm = TRUE
    ),
    TD_Mi_SS_Cnt = sum(!is.na(TravelDistance_Mi)),
    TD_Mi_SS_Cnt_F = sum(!is.na(
      TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_SS_q5 &
                          TravelDistance_Mi <= TD_Mi_SS_q95]
    )),
    # Travel time (seconds)
    TT_Sec_SS_Mean = mean(TravelTime_Sec, na.rm = TRUE),
    TT_Sec_SS_Mean_F = mean(
      TravelTime_Sec[TravelTime_Sec >= TT_Sec_SS_q5 &
                       TravelTime_Sec <= TT_Sec_SS_q95],
      na.rm = TRUE
    ),
    TT_Sec_SS_Med = median(TravelTime_Sec, na.rm = TRUE),
    TT_Sec_SS_Med_F = median(
      TravelTime_Sec[TravelTime_Sec >= TT_Sec_SS_q5 &
                       TravelTime_Sec <= TT_Sec_SS_q95],
      na.rm = TRUE
    ),
    TT_Sec_SS_Cnt = sum(!is.na(TravelTime_Sec)),
    TT_Sec_SS_Cnt_F = sum(!is.na(
      TravelTime_Sec[TravelTime_Sec >= TT_Sec_SS_q5 &
                       TravelTime_Sec <= TT_Sec_SS_q95]
    )),
    # Travel time (hours)
    TT_Hr_SS_Mean = mean(TravelTime_Hr, na.rm = TRUE),
    TT_Hr_SS_Mean_F = mean(
      TravelTime_Hr[TravelTime_Hr >= TT_Hr_SS_q5 &
                      TravelTime_Hr <= TT_Hr_SS_q95],
      na.rm = TRUE
    ),
    TT_Hr_SS_Med = median(TravelTime_Hr, na.rm = TRUE),
    TT_Hr_SS_Med_F = median(
      TravelTime_Hr[TravelTime_Hr >= TT_Hr_SS_q5 &
                      TravelTime_Hr <= TT_Hr_SS_q95],
      na.rm = TRUE
    ),
    TT_Hr_SS_Cnt = sum(!is.na(TravelTime_Hr)),
    TT_Hr_SS_Cnt_F = sum(!is.na(
      TravelTime_Hr[TravelTime_Hr >= TT_Hr_SS_q5 &
                      TravelTime_Hr <= TT_Hr_SS_q95]
    ))
  ) %>%
  data.frame()
# Drop the two inputs that are no longer needed, then inspect the result.
rm(Stats, Quantiles_SS_dt)
str(Stats_StSt)
'data.frame': 2809529 obs. of 113 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Route_Lag1 : chr NA "S80" "S80" "S80" ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Hr_Mean : num 0.0291 0.0291 0.0291 0.0291 0.0291 ...
$ TT_Hr_Mean_F : num 0.0157 0.0157 0.0157 0.0157 0.0157 ...
$ TT_Hr_Med : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Med_F : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Hr_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
[list output truncated]
# View(head(Stats_StSt, 50))
Stats (5th/95th percentiles, means, medians, and counts) for each combination of StartStop_ID and Event_Time_HrGroup.
# Per start-stop pair and hour group (StartStop_ID x Event_Time_HrGroup):
# attach the 5th and 95th percentile cut-offs of travel distance (miles) and
# travel time (seconds and hours) to every row of the group, then convert
# the result to a data.table.
Quantiles_SSHG_dt <- Stats_StSt %>%
  group_by(StartStop_ID, Event_Time_HrGroup) %>%
  mutate(
    TD_Mi_SSHG_q5 = quantile(TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
    TD_Mi_SSHG_q95 = quantile(TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
    TT_Sec_SSHG_q5 = quantile(TravelTime_Sec, probs = 0.05, na.rm = TRUE),
    TT_Sec_SSHG_q95 = quantile(TravelTime_Sec, probs = 0.95, na.rm = TRUE),
    TT_Hr_SSHG_q5 = quantile(TravelTime_Hr, probs = 0.05, na.rm = TRUE),
    TT_Hr_SSHG_q95 = quantile(TravelTime_Hr, probs = 0.95, na.rm = TRUE)
  ) %>%
  data.table()
# Summary statistics per start-stop pair and hour group
# (StartStop_ID x Event_Time_HrGroup), attached to every row of the group.
# For each of TravelDistance_Mi, TravelTime_Sec and TravelTime_Hr:
#   *_SSHG_Mean / *_SSHG_Med / *_SSHG_Cnt       - mean, median, non-NA count
#   *_SSHG_Mean_F / *_SSHG_Med_F / *_SSHG_Cnt_F - the same, restricted to
#                                                 values within the group's
#                                                 [q5, q95] cut-offs
#                                                 (inclusive)
Stats_StSt_HrGrp <- Quantiles_SSHG_dt %>%
  group_by(StartStop_ID, Event_Time_HrGroup) %>%
  mutate(
    # Travel distance (miles)
    TD_Mi_SSHG_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
    TD_Mi_SSHG_Mean_F = mean(
      TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_SSHG_q5 &
                          TravelDistance_Mi <= TD_Mi_SSHG_q95],
      na.rm = TRUE
    ),
    TD_Mi_SSHG_Med = median(TravelDistance_Mi, na.rm = TRUE),
    TD_Mi_SSHG_Med_F = median(
      TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_SSHG_q5 &
                          TravelDistance_Mi <= TD_Mi_SSHG_q95],
      na.rm = TRUE
    ),
    TD_Mi_SSHG_Cnt = sum(!is.na(TravelDistance_Mi)),
    TD_Mi_SSHG_Cnt_F = sum(!is.na(
      TravelDistance_Mi[TravelDistance_Mi >= TD_Mi_SSHG_q5 &
                          TravelDistance_Mi <= TD_Mi_SSHG_q95]
    )),
    # Travel time (seconds)
    TT_Sec_SSHG_Mean = mean(TravelTime_Sec, na.rm = TRUE),
    TT_Sec_SSHG_Mean_F = mean(
      TravelTime_Sec[TravelTime_Sec >= TT_Sec_SSHG_q5 &
                       TravelTime_Sec <= TT_Sec_SSHG_q95],
      na.rm = TRUE
    ),
    TT_Sec_SSHG_Med = median(TravelTime_Sec, na.rm = TRUE),
    TT_Sec_SSHG_Med_F = median(
      TravelTime_Sec[TravelTime_Sec >= TT_Sec_SSHG_q5 &
                       TravelTime_Sec <= TT_Sec_SSHG_q95],
      na.rm = TRUE
    ),
    TT_Sec_SSHG_Cnt = sum(!is.na(TravelTime_Sec)),
    TT_Sec_SSHG_Cnt_F = sum(!is.na(
      TravelTime_Sec[TravelTime_Sec >= TT_Sec_SSHG_q5 &
                       TravelTime_Sec <= TT_Sec_SSHG_q95]
    )),
    # Travel time (hours)
    TT_Hr_SSHG_Mean = mean(TravelTime_Hr, na.rm = TRUE),
    TT_Hr_SSHG_Mean_F = mean(
      TravelTime_Hr[TravelTime_Hr >= TT_Hr_SSHG_q5 &
                      TravelTime_Hr <= TT_Hr_SSHG_q95],
      na.rm = TRUE
    ),
    TT_Hr_SSHG_Med = median(TravelTime_Hr, na.rm = TRUE),
    TT_Hr_SSHG_Med_F = median(
      TravelTime_Hr[TravelTime_Hr >= TT_Hr_SSHG_q5 &
                      TravelTime_Hr <= TT_Hr_SSHG_q95],
      na.rm = TRUE
    ),
    TT_Hr_SSHG_Cnt = sum(!is.na(TravelTime_Hr)),
    TT_Hr_SSHG_Cnt_F = sum(!is.na(
      TravelTime_Hr[TravelTime_Hr >= TT_Hr_SSHG_q5 &
                      TravelTime_Hr <= TT_Hr_SSHG_q95]
    ))
  ) %>%
  data.frame()
# Drop the two inputs that are no longer needed, then inspect the result.
rm(Stats_StSt, Quantiles_SSHG_dt)
str(Stats_StSt_HrGrp)
'data.frame': 2809529 obs. of 137 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Route_Lag1 : chr NA "S80" "S80" "S80" ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Hr_Mean : num 0.0291 0.0291 0.0291 0.0291 0.0291 ...
$ TT_Hr_Mean_F : num 0.0157 0.0157 0.0157 0.0157 0.0157 ...
$ TT_Hr_Med : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Med_F : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Hr_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
[list output truncated]
# View(head(Stats_StSt_HrGrp, 50))
Feature engineering.
Calculating variables that flag whether the Route or the RouteAlt changed from the previous record. These could be useful in helping to identify weirdness in the calculated distances and speeds.
# Flag, for every record, whether Route / RouteAlt differ from the values
# carried in the corresponding *_Lag1 columns.
#   RteChange  / DirChange  : "Same" or "Change"; NA whenever the comparison
#                             itself is NA (e.g. the lag value is missing).
#   RteChange2 / DirChange2 : factor versions in which NA is counted as
#                             "Change".
# Alternative input kept for reference: AllDays_BusDayRoute.
AllDays_DirChange <- mutate(
  Stats_StSt_HrGrp,
  RteChange = ifelse(Route == Route_Lag1, "Same", "Change"),
  RteChange2 = factor(ifelse(is.na(RteChange), "Change", RteChange)),
  DirChange = ifelse(RouteAlt == RouteAlt_Lag1, "Same", "Change"),
  DirChange2 = factor(ifelse(is.na(DirChange), "Change", DirChange))
)

# The input table has been superseded by AllDays_DirChange; remove it from the
# workspace, then inspect the structure of the result.
# rm(AllDays_BusDayRoute)
rm(Stats_StSt_HrGrp)
str(AllDays_DirChange)
'data.frame': 2809529 obs. of 141 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Lat_Mean : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lat_Med : num 38.8 38.8 38.8 38.8 38.8 ...
$ Lng_Mean : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lng_Med : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Lat_MeaLessMed : num -7.94e-04 -1.85e-04 -1.73e-04 1.62e-04 4.75e-05 ...
$ Lng_MeaLessMed : num 3.72e-04 -6.78e-04 1.69e-04 4.11e-05 -1.52e-04 ...
$ RowNum : int 9715 9716 9674 9673 8168 9701 9829 9828 9667 9829 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ id : chr "10" "10" "10" "10" ...
$ adminCode2 : chr "059" "059" "059" "059" ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ lng : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ distance : num 0 0 0 0 0 0 0 0 0 0 ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ adminName1 : Factor w/ 3 levels "District of Columbia",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ lat : num 38.8 38.8 38.8 38.8 38.8 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Route_Lag1 : chr NA "S80" "S80" "S80" ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Hr_Mean : num 0.0291 0.0291 0.0291 0.0291 0.0291 ...
$ TT_Hr_Mean_F : num 0.0157 0.0157 0.0157 0.0157 0.0157 ...
$ TT_Hr_Med : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Med_F : num 0.0108 0.0108 0.0108 0.0108 0.0108 ...
$ TT_Hr_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Hr_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
[list output truncated]
# Open the data viewer on the records whose RowNum_OG lies between 2570060 and
# 2570080 (bounds included). Columns whose names match q2, q5, q95, q98, Mean,
# Med or Cnt are dropped so the viewer is narrower.
View(filter(AllDays_DirChange,
between(RowNum_OG, 2570060, 2570080)
) %>%
select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
)
)
Re-ordering the variables to make the data easier to comprehend.
# Build AllDays_NewOrder: the columns of AllDays_DirChange listed below, in the
# order listed. Columns that are not named here are left out of the result.
AllDays_NewOrder <- AllDays_DirChange %>%
  select(
    # Record / bus / route identifiers
    RowNum_OG, UniqueLatLng, group, StartStop_ID, BusDay_EventNum, Bus_ID,
    Route, RteChange2,
    RouteAlt, # RouteAlt_Lag1,
    DirChange2, Route_Direction, Stop_Sequence,
    # Start and stop descriptors
    Start_ID, Start_Desc,
    # Stop_ID,
    StopID_Clean, StopID_Indicator, Stop_Desc,
    countryCode, Stop_State, Stop_County, Stop_City, Stop_Zip,
    # Event type and timing
    Event_Type, Event_Description,
    Event_Time_Yr, Event_Time_Mth, Event_Time_Date, Event_Time_Day,
    Event_Time_Hr, Event_Time_HrGroup, Event_Time_Min,
    Event_Time, Departure_Time, Dwell_Time, Dwell_Time2, Delta_Time,
    # Position and odometer
    Latitude, Longitude, Heading,
    Odometer_Distance, Odometer_Distance_Lag1, Odometer_Distance_Mi,
    # Travel distance and its TD_Mi_* statistics
    TravelDistance_Ft, TravelDistance_Mi, TravelDistance_Mi_Hvrs,
    TD_Mi_q2, TD_Mi_q98,
    TD_Mi_SS_q5, TD_Mi_SS_q95, TD_Mi_SSHG_q5, TD_Mi_SSHG_q95,
    TD_Mi_Mean, TD_Mi_Mean_F,
    TD_Mi_SS_Mean, TD_Mi_SS_Mean_F, TD_Mi_SSHG_Mean, TD_Mi_SSHG_Mean_F,
    TD_Mi_Med, TD_Mi_Med_F,
    TD_Mi_SS_Med, TD_Mi_SS_Med_F, TD_Mi_SSHG_Med, TD_Mi_SSHG_Med_F,
    TD_Mi_Cnt, TD_Mi_Cnt_F,
    TD_Mi_SS_Cnt, TD_Mi_SS_Cnt_F, TD_Mi_SSHG_Cnt, TD_Mi_SSHG_Cnt_F,
    # Travel time in seconds and its TT_Sec_* statistics
    TravelTime_Sec,
    TT_Sec_q2, TT_Sec_q98,
    TT_Sec_SS_q5, TT_Sec_SS_q95, TT_Sec_SSHG_q5, TT_Sec_SSHG_q95,
    TT_Sec_Mean, TT_Sec_Mean_F,
    TT_Sec_SS_Mean, TT_Sec_SS_Mean_F, TT_Sec_SSHG_Mean, TT_Sec_SSHG_Mean_F,
    TT_Sec_Med, TT_Sec_Med_F,
    TT_Sec_SS_Med, TT_Sec_SS_Med_F, TT_Sec_SSHG_Med, TT_Sec_SSHG_Med_F,
    TT_Sec_Cnt, TT_Sec_Cnt_F,
    TT_Sec_SS_Cnt, TT_Sec_SS_Cnt_F, TT_Sec_SSHG_Cnt, TT_Sec_SSHG_Cnt_F,
    # Travel time in hours and its TT_Hr_* statistics
    TravelTime_Hr,
    TT_Hr_q2, TT_Hr_q98,
    TT_Hr_SS_q5, TT_Hr_SS_q95, TT_Hr_SSHG_q5, TT_Hr_SSHG_q95,
    TT_Hr_Mean, TT_Hr_Mean_F,
    TT_Hr_SS_Mean, TT_Hr_SS_Mean_F, TT_Hr_SSHG_Mean, TT_Hr_SSHG_Mean_F,
    TT_Hr_Med, TT_Hr_Med_F,
    TT_Hr_SS_Med, TT_Hr_SS_Med_F, TT_Hr_SSHG_Med, TT_Hr_SSHG_Med_F,
    TT_Hr_Cnt, TT_Hr_Cnt_F,
    TT_Hr_SS_Cnt, TT_Hr_SS_Cnt_F, TT_Hr_SSHG_Cnt, TT_Hr_SSHG_Cnt_F,
    # Average speed
    SpeedAvg_Mph
  )

# The un-ordered table is no longer needed once the re-ordered copy exists.
rm(AllDays_DirChange)
# Compact structure listing: hide every column whose name matches q2, q5, q95,
# q98, Mean, Med or Cnt before printing the structure.
AllDays_NewOrder %>%
  select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")) %>%
  str()
'data.frame': 2809529 obs. of 48 variables:
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RteChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 2 2 2 2 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 1 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
# Full structure listing of the re-ordered table (all columns, nothing hidden).
str(AllDays_NewOrder)
'data.frame': 2809529 obs. of 120 variables:
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RteChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 2 2 2 2 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 1 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TD_Mi_SSHG_q5 : num NA 0.0996 0.2422 0.7002 0.1816 ...
$ TD_Mi_SSHG_q95 : num NA 0.627 0.242 0.7 0.182 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SSHG_Mean : num NaN 0.442 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Mean_F : num NaN 0.491 0.242 0.7 0.182 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
$ TD_Mi_SSHG_Med : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med_F : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TD_Mi_SS_Cnt : int 0 14 1 4 2 87 22 118 91 11 ...
$ TD_Mi_SS_Cnt_F : int 0 12 1 3 0 77 18 106 81 9 ...
$ TD_Mi_SSHG_Cnt : int 0 7 1 1 1 23 6 29 28 3 ...
$ TD_Mi_SSHG_Cnt_F : int 0 5 1 1 1 19 4 25 24 1 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Sec_SSHG_q5 : num NA 59.6 37 25 190 11.6 236 51.5 55 8.8 ...
$ TT_Sec_SSHG_q95 : num NA 276 37 25 190 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_SS_Mean : num NaN 215.8 37 58.2 181 ...
$ TT_Sec_SS_Mean_F : num NaN 218.9 37 65.5 NaN ...
$ TT_Sec_SSHG_Mean : num NaN 202 37 25 190 ...
$ TT_Sec_SSHG_Mean_F : num NaN 226 37 25 190 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_SS_Med : num NA 223.5 37 65.5 181 ...
$ TT_Sec_SS_Med_F : num NA 223.5 37 65.5 NA ...
$ TT_Sec_SSHG_Med : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Med_F : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Sec_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Sec_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Sec_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Sec_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 32 1 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
[list output truncated]
# View(head(AllDays_NewOrder, 500))
# View(tail(AllDays_NewOrder, 500))
Summarizing the data to help spot anomalies.
# Per-column summary of the whole table; useful for checking extreme values
# (minimum / maximum) and NA counts column by column.
summary(AllDays_NewOrder)
RowNum_OG UniqueLatLng group StartStop_ID
Min. : 1 Length:2809529 1:559521 Length:2809529
1st Qu.: 784722 Class :character 2:561389 Class :character
Median :1563300 Mode :character 3:567794 Mode :character
Mean :1562504 4:559180
3rd Qu.:2337981 5:561645
Max. :3119443
BusDay_EventNum Bus_ID Route RteChange2
Min. : 1.0 Min. : 11 Length:2809529 Change: 23772
1st Qu.: 113.0 1st Qu.:2922 Class :character Same :2785757
Median : 248.0 Median :6195 Mode :character
Mean : 290.5 Mean :5382
3rd Qu.: 428.0 3rd Qu.:7104
Max. :1344.0 Max. :8105
RouteAlt DirChange2 Route_Direction Stop_Sequence
2 :1128810 Change: 65126 SOUTH :739235 Min. : 1.00
1 :1065425 Same :2744403 NORTH :735203 1st Qu.: 12.00
3 : 260372 WEST :649706 Median : 24.00
4 : 130801 EAST :628074 Mean : 26.83
5 : 75039 LOOP : 35611 3rd Qu.: 39.00
6 : 56408 CLOCKWIS: 10671 Max. :104.00
(Other): 92674 (Other) : 11029
Start_ID Start_Desc StopID_Clean StopID_Indicator
Length:2809529 Length:2809529 Length:2809529 ID_Bad: 18948
Class :character Class :character Class :character ID_OK :2790581
Mode :character Mode :character Mode :character
Stop_Desc countryCode Stop_State Stop_County
Length:2809529 US :2808431 DC :1297006 District of Columbia:1297006
Class :character NA's: 1098 MD : 982401 Prince George's : 589193
Mode :character VA : 529024 Montgomery : 391422
NA's: 1098 Fairfax : 204558
Arlington : 198618
(Other) : 127634
NA's : 1098
Stop_City Stop_Zip Event_Type
Washington :1296626 20020 : 156333 Min. :3.0
Silver Spring: 227570 20032 : 117215 1st Qu.:3.0
Arlington : 198360 20019 : 116560 Median :4.0
Hyattsville : 166930 20011 : 114518 Mean :3.6
Alexandria : 103776 20002 : 101086 3rd Qu.:4.0
(Other) : 815169 (Other):2202719 Max. :5.0
NA's : 1098 NA's : 1098
Event_Description Event_Time_Yr
Serviced Stop :1127366 Min. :2016
Unknown Stop : 2579 1st Qu.:2016
UnServiced Stop :1679584 Median :2016
Mean :2016
3rd Qu.:2016
Max. :2016
Event_Time_Mth Event_Time_Date Event_Time_Day Event_Time_Hr Event_Time_HrGroup
Min. :10 Min. :3.000 Sun : 0 Min. : 0.00 Group6_8 :611612
1st Qu.:10 1st Qu.:4.000 Mon :559521 1st Qu.: 8.00 Group15_17:560103
Median :10 Median :5.000 Tues :561389 Median :13.00 Group18_20:461056
Mean :10 Mean :5.001 Wed :567794 Mean :12.97 Group9_11 :396514
3rd Qu.:10 3rd Qu.:6.000 Thurs:559180 3rd Qu.:18.00 Group12_14:353603
Max. :10 Max. :7.000 Fri :561645 Max. :23.00 Group21_23:244522
Sat : 0 (Other) :182119
Event_Time_Min Event_Time Departure_Time
Min. : 0.00 Min. :2016-10-03 00:00:00 Min. :2016-10-03 00:00:00
1st Qu.:14.00 1st Qu.:2016-10-04 08:36:14 1st Qu.:2016-10-04 08:36:20
Median :29.00 Median :2016-10-05 13:49:29 Median :2016-10-05 13:49:38
Mean :29.43 Mean :2016-10-05 13:29:21 Mean :2016-10-05 13:29:28
3rd Qu.:44.00 3rd Qu.:2016-10-06 17:58:06 3rd Qu.:2016-10-06 17:58:13
Max. :59.00 Max. :2016-10-07 23:59:59 Max. :2016-10-08 00:12:31
Dwell_Time Dwell_Time2 Delta_Time Latitude
Min. : 0.00 Min. : 0.000 Min. :-5606.0 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 14.0 1st Qu.:38.86
Median : 0.00 Median : 0.000 Median : 157.0 Median :38.90
Mean : 12.56 Mean : 6.359 Mean : 268.8 Mean :38.91
3rd Qu.: 5.00 3rd Qu.: 4.000 3rd Qu.: 396.0 3rd Qu.:38.96
Max. :6205.00 Max. :6205.000 Max. : 9426.0 Max. :39.19
Longitude Heading Odometer_Distance Odometer_Distance_Lag1
Min. :-77.45 Min. : 0.0 Min. : 0 Min. : 0
1st Qu.:-77.07 1st Qu.: 89.0 1st Qu.: 177595 1st Qu.: 177326
Median :-77.02 Median :180.0 Median : 377510 Median : 376934
Mean :-77.02 Mean :176.9 Mean : 426254 Mean : 425713
3rd Qu.:-76.97 3rd Qu.:269.0 3rd Qu.: 623667 3rd Qu.: 622879
Max. : 0.00 Max. :360.0 Max. :11108034 Max. :10853226
NA's :6528
Odometer_Distance_Mi TravelDistance_Ft TravelDistance_Mi TravelDistance_Mi_Hvrs
Min. : 0.00 Min. : 1 Min. : 0.0 Min. : 0.000
1st Qu.: 33.64 1st Qu.: 699 1st Qu.: 0.1 1st Qu.: 0.106
Median : 71.50 Median : 1044 Median : 0.2 Median : 0.142
Mean : 80.73 Mean : 1624 Mean : 0.3 Mean : 0.201
3rd Qu.: 118.12 3rd Qu.: 1518 3rd Qu.: 0.3 3rd Qu.: 0.193
Max. :2103.79 Max. :1323464 Max. :250.7 Max. :24.407
NA's :322734 NA's :322734 NA's :6528
TD_Mi_q2 TD_Mi_q98 TD_Mi_SS_q5 TD_Mi_SS_q95
Min. :0.05208 Min. :0.9585 Min. : 0.000 Min. : 0.000
1st Qu.:0.05208 1st Qu.:0.9585 1st Qu.: 0.086 1st Qu.: 0.262
Median :0.05208 Median :0.9585 Median : 0.104 Median : 0.326
Mean :0.05208 Mean :0.9585 Mean : 0.164 Mean : 0.488
3rd Qu.:0.05208 3rd Qu.:0.9585 3rd Qu.: 0.139 3rd Qu.: 0.436
Max. :0.05208 Max. :0.9585 Max. :219.163 Max. :246.949
NA's :24757 NA's :24757
TD_Mi_SSHG_q5 TD_Mi_SSHG_q95 TD_Mi_Mean TD_Mi_Mean_F
Min. : 0.00 Min. : 0.00 Min. :0.3076 Min. :0.2318
1st Qu.: 0.09 1st Qu.: 0.25 1st Qu.:0.3076 1st Qu.:0.2318
Median : 0.11 Median : 0.31 Median :0.3076 Median :0.2318
Mean : 0.18 Mean : 0.47 Mean :0.3076 Mean :0.2318
3rd Qu.: 0.15 3rd Qu.: 0.42 3rd Qu.:0.3076 3rd Qu.:0.2318
Max. :250.66 Max. :250.66 Max. :0.3076 Max. :0.2318
NA's :35629 NA's :35629
TD_Mi_SS_Mean TD_Mi_SS_Mean_F TD_Mi_SSHG_Mean TD_Mi_SSHG_Mean_F
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
1st Qu.: 0.172 1st Qu.: 0.166 1st Qu.: 0.17 1st Qu.: 0.16
Median : 0.212 Median : 0.207 Median : 0.21 Median : 0.21
Mean : 0.307 Mean : 0.291 Mean : 0.31 Mean : 0.29
3rd Qu.: 0.267 3rd Qu.: 0.260 3rd Qu.: 0.27 3rd Qu.: 0.26
Max. :219.163 Max. :219.163 Max. :250.66 Max. :250.66
NA's :24757 NA's :27919 NA's :35629 NA's :44458
TD_Mi_Med TD_Mi_Med_F TD_Mi_SS_Med TD_Mi_SS_Med_F
Min. :0.1977 Min. :0.1977 Min. : 0.000 Min. : 0.000
1st Qu.:0.1977 1st Qu.:0.1977 1st Qu.: 0.146 1st Qu.: 0.146
Median :0.1977 Median :0.1977 Median : 0.196 Median : 0.196
Mean :0.1977 Mean :0.1977 Mean : 0.288 Mean : 0.282
3rd Qu.:0.1977 3rd Qu.:0.1977 3rd Qu.: 0.265 3rd Qu.: 0.265
Max. :0.1977 Max. :0.1977 Max. :219.163 Max. :219.163
NA's :24757 NA's :27919
TD_Mi_SSHG_Med TD_Mi_SSHG_Med_F TD_Mi_Cnt TD_Mi_Cnt_F
Min. : 0.00 Min. : 0.00 Min. :2486795 Min. :2387406
1st Qu.: 0.14 1st Qu.: 0.14 1st Qu.:2486795 1st Qu.:2387406
Median : 0.20 Median : 0.20 Median :2486795 Median :2387406
Mean : 0.29 Mean : 0.28 Mean :2486795 Mean :2387406
3rd Qu.: 0.27 3rd Qu.: 0.27 3rd Qu.:2486795 3rd Qu.:2387406
Max. :250.66 Max. :250.66 Max. :2486795 Max. :2387406
NA's :35629 NA's :44458
TD_Mi_SS_Cnt TD_Mi_SS_Cnt_F TD_Mi_SSHG_Cnt TD_Mi_SSHG_Cnt_F
Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.00
1st Qu.: 163.0 1st Qu.: 146.0 1st Qu.: 26.00 1st Qu.: 22.00
Median : 280.0 Median : 252.0 Median : 45.00 Median : 39.00
Mean : 347.4 Mean : 312.7 Mean : 57.27 Mean : 50.85
3rd Qu.: 456.0 3rd Qu.: 411.0 3rd Qu.: 75.00 3rd Qu.: 67.00
Max. :1543.0 Max. :1388.0 Max. :663.00 Max. :595.00
TravelTime_Sec TT_Sec_q2 TT_Sec_q98 TT_Sec_SS_q5 TT_Sec_SS_q95
Min. : 1.0 Min. :10 Min. :349 Min. : 1.00 Min. : 1.00
1st Qu.: 25.0 1st Qu.:10 1st Qu.:349 1st Qu.: 15.00 1st Qu.: 48.00
Median : 39.0 Median :10 Median :349 Median : 22.00 Median : 80.05
Mean : 104.9 Mean :10 Mean :349 Mean : 61.26 Mean : 183.28
3rd Qu.: 72.0 3rd Qu.:10 3rd Qu.:349 3rd Qu.: 34.00 3rd Qu.: 134.60
Max. :60750.0 Max. :10 Max. :349 Max. :60750.00 Max. :60750.00
NA's :6641 NA's :6531 NA's :6531
TT_Sec_SSHG_q5 TT_Sec_SSHG_q95 TT_Sec_Mean TT_Sec_Mean_F
Min. : 1.00 Min. : 1.00 Min. :104.9 Min. :56.61
1st Qu.: 16.00 1st Qu.: 43.80 1st Qu.:104.9 1st Qu.:56.61
Median : 23.40 Median : 72.95 Median :104.9 Median :56.61
Mean : 67.33 Mean : 169.21 Mean :104.9 Mean :56.61
3rd Qu.: 36.70 3rd Qu.: 123.65 3rd Qu.:104.9 3rd Qu.:56.61
Max. :60750.00 Max. :60750.00 Max. :104.9 Max. :56.61
NA's :6535 NA's :6535
TT_Sec_SS_Mean TT_Sec_SS_Mean_F TT_Sec_SSHG_Mean TT_Sec_SSHG_Mean_F
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 29.06 1st Qu.: 27.54 1st Qu.: 28.38 1st Qu.: 27.21
Median : 44.16 Median : 41.91 Median : 43.38 Median : 41.48
Mean : 104.88 Mean : 91.34 Mean : 104.88 Mean : 93.53
3rd Qu.: 73.30 3rd Qu.: 69.25 3rd Qu.: 72.93 3rd Qu.: 70.12
Max. :60750.00 Max. :60750.00 Max. :60750.00 Max. :60750.00
NA's :6531 NA's :10519 NA's :6535 NA's :12811
TT_Sec_Med TT_Sec_Med_F TT_Sec_SS_Med TT_Sec_SS_Med_F
Min. :39 Min. :39 Min. : 1.00 Min. : 1.00
1st Qu.:39 1st Qu.:39 1st Qu.: 26.00 1st Qu.: 26.00
Median :39 Median :39 Median : 39.00 Median : 39.00
Mean :39 Mean :39 Mean : 91.55 Mean : 84.82
3rd Qu.:39 3rd Qu.:39 3rd Qu.: 65.00 3rd Qu.: 65.00
Max. :39 Max. :39 Max. :60750.00 Max. :60750.00
NA's :6531 NA's :10519
TT_Sec_SSHG_Med TT_Sec_SSHG_Med_F TT_Sec_Cnt TT_Sec_Cnt_F
Min. : 1.00 Min. : 1.00 Min. :2802888 Min. :2705189
1st Qu.: 26.00 1st Qu.: 26.00 1st Qu.:2802888 1st Qu.:2705189
Median : 39.00 Median : 38.50 Median :2802888 Median :2705189
Mean : 94.94 Mean : 88.44 Mean :2802888 Mean :2705189
3rd Qu.: 67.00 3rd Qu.: 66.50 3rd Qu.:2802888 3rd Qu.:2705189
Max. :60750.00 Max. :60750.00 Max. :2802888 Max. :2705189
NA's :6535 NA's :12811
TT_Sec_SS_Cnt TT_Sec_SS_Cnt_F TT_Sec_SSHG_Cnt TT_Sec_SSHG_Cnt_F
Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.00
1st Qu.: 194.0 1st Qu.: 177.0 1st Qu.: 29.00 1st Qu.: 26.00
Median : 310.0 Median : 282.0 Median : 51.00 Median : 46.00
Mean : 384.4 Mean : 349.8 Mean : 63.46 Mean : 57.09
3rd Qu.: 497.0 3rd Qu.: 452.0 3rd Qu.: 83.00 3rd Qu.: 74.00
Max. :1664.0 Max. :1523.0 Max. :691.00 Max. :634.00
TravelTime_Hr TT_Hr_q2 TT_Hr_q98 TT_Hr_SS_q5
Min. : 0.000 Min. :0.002778 Min. :0.09694 Min. : 0.000
1st Qu.: 0.007 1st Qu.:0.002778 1st Qu.:0.09694 1st Qu.: 0.004
Median : 0.011 Median :0.002778 Median :0.09694 Median : 0.006
Mean : 0.029 Mean :0.002778 Mean :0.09694 Mean : 0.017
3rd Qu.: 0.020 3rd Qu.:0.002778 3rd Qu.:0.09694 3rd Qu.: 0.009
Max. :16.875 Max. :0.002778 Max. :0.09694 Max. :16.875
NA's :6641 NA's :6531
TT_Hr_SS_q95 TT_Hr_SSHG_q5 TT_Hr_SSHG_q95 TT_Hr_Mean
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :0.02913
1st Qu.: 0.013 1st Qu.: 0.004 1st Qu.: 0.012 1st Qu.:0.02913
Median : 0.022 Median : 0.006 Median : 0.020 Median :0.02913
Mean : 0.051 Mean : 0.019 Mean : 0.047 Mean :0.02913
3rd Qu.: 0.037 3rd Qu.: 0.010 3rd Qu.: 0.034 3rd Qu.:0.02913
Max. :16.875 Max. :16.875 Max. :16.875 Max. :0.02913
NA's :6531 NA's :6535 NA's :6535
TT_Hr_Mean_F TT_Hr_SS_Mean TT_Hr_SS_Mean_F TT_Hr_SSHG_Mean
Min. :0.01573 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.:0.01573 1st Qu.: 0.008 1st Qu.: 0.008 1st Qu.: 0.008
Median :0.01573 Median : 0.012 Median : 0.012 Median : 0.012
Mean :0.01573 Mean : 0.029 Mean : 0.025 Mean : 0.029
3rd Qu.:0.01573 3rd Qu.: 0.020 3rd Qu.: 0.019 3rd Qu.: 0.020
Max. :0.01573 Max. :16.875 Max. :16.875 Max. :16.875
NA's :6531 NA's :10532 NA's :6535
TT_Hr_SSHG_Mean_F TT_Hr_Med TT_Hr_Med_F TT_Hr_SS_Med
Min. : 0.000 Min. :0.01083 Min. :0.01083 Min. : 0.000
1st Qu.: 0.008 1st Qu.:0.01083 1st Qu.:0.01083 1st Qu.: 0.007
Median : 0.012 Median :0.01083 Median :0.01083 Median : 0.011
Mean : 0.026 Mean :0.01083 Mean :0.01083 Mean : 0.025
3rd Qu.: 0.019 3rd Qu.:0.01083 3rd Qu.:0.01083 3rd Qu.: 0.018
Max. :16.875 Max. :0.01083 Max. :0.01083 Max. :16.875
NA's :12895 NA's :6531
TT_Hr_SS_Med_F TT_Hr_SSHG_Med TT_Hr_SSHG_Med_F TT_Hr_Cnt
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :2802888
1st Qu.: 0.007 1st Qu.: 0.007 1st Qu.: 0.007 1st Qu.:2802888
Median : 0.011 Median : 0.011 Median : 0.011 Median :2802888
Mean : 0.024 Mean : 0.026 Mean : 0.025 Mean :2802888
3rd Qu.: 0.018 3rd Qu.: 0.019 3rd Qu.: 0.018 3rd Qu.:2802888
Max. :16.875 Max. :16.875 Max. :16.875 Max. :2802888
NA's :10532 NA's :6535 NA's :12895
TT_Hr_Cnt_F TT_Hr_SS_Cnt TT_Hr_SS_Cnt_F TT_Hr_SSHG_Cnt
Min. :2705189 Min. : 0.0 Min. : 0.0 Min. : 0.00
1st Qu.:2705189 1st Qu.: 194.0 1st Qu.: 176.0 1st Qu.: 29.00
Median :2705189 Median : 310.0 Median : 282.0 Median : 51.00
Mean :2705189 Mean : 384.4 Mean : 349.6 Mean : 63.46
3rd Qu.:2705189 3rd Qu.: 497.0 3rd Qu.: 452.0 3rd Qu.: 83.00
Max. :2705189 Max. :1664.0 Max. :1523.0 Max. :691.00
TT_Hr_SSHG_Cnt_F SpeedAvg_Mph
Min. : 0.00 Min. : 0.0
1st Qu.: 26.00 1st Qu.: 10.1
Median : 46.00 Median : 16.7
Mean : 57.05 Mean : 26.5
3rd Qu.: 74.00 3rd Qu.: 31.2
Max. :634.00 Max. :22924.1
NA's :322762
Investigation of TravelDistance_Mi.
View(TravDistMi_Pctiles): 99% of TravelDistance_Mi are about 1 mile or less…but some weird TravelDistance_Mi values (e.g., 584 miles traveled) exist.
# Percentile table for TravelDistance_Mi.
#
# Result: TravDistMi_Pctiles, one row per rounded percent rank (PctR_Round),
# with
#   MinTravDistMiAtPctile : smallest TravelDistance_Mi in that group
#   CntsAtPctile          : number of records in that group
#   PctsAtPctile          : CntsAtPctile as a share of the records ranked
#   CumSumPAtP            : running total of PctsAtPctile
#
# Missing distances are dropped before ranking. percent_rank() returns NA for
# them, so keeping them would (a) add an all-NA group whose minimum is NA and
# (b) inflate the denominator of PctsAtPctile, which makes CumSumPAtP disagree
# with PctR_Round (both are meant to describe the same distribution).
#
# Alternatives to percent_rank() that were considered: ntile(x, 100) and
# min_rank(x).
TravDistMi_Ntile <- AllDays_NewOrder %>%
  select(TravelDistance_Mi) %>%
  filter(!is.na(TravelDistance_Mi)) %>%
  mutate(
    PctR = percent_rank(TravelDistance_Mi),
    PctR_Round = round(PctR, 2)
  )
# str(TravDistMi_Ntile)

# Denominator for the shares: number of records that actually have a distance.
TravDistMi_Ntile_Rows <- nrow(TravDistMi_Ntile)
# View(tail(TravDistMi_Ntile, 500))

TravDistMi_Pctiles <- TravDistMi_Ntile %>%
  group_by(PctR_Round) %>%
  summarise(
    MinTravDistMiAtPctile = min(TravelDistance_Mi),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravDistMi_Ntile_Rows
  ) %>%
  mutate(CumSumPAtP = cumsum(PctsAtPctile))

# Clean up the intermediates; only the percentile table is kept.
rm(TravDistMi_Ntile)
rm(TravDistMi_Ntile_Rows)

View(TravDistMi_Pctiles)
TravDistMi_Pctiles
Investigation of TravelDistance_Mi.
Why are some TravelDistance_Mi values “NA”? Partly because the record is the first trip of the day for that bus, in which case I purposefully set the distance to “NA”. The other reason is that the odometer recorded a value lower than the previous odometer reading. In most cases I have no explanation for this - though I have observed that about 67% of all instances where TravelDistance_Mi is NA (other than because it’s the first record of the day) are instances where DirChange2 is “Change”. This is odd and should be raised with WMATA.
Investigation of TravelDistance_Mi.
These records are NA because the current record’s odometer reading is less than the previous record’s odometer reading. Theoretically, this should NOT happen. Note: it appears that about 67% of all instances where TravelDistance_Mi is NA (other than because it’s the first record of the day) are instances where DirChange2 is “Change”. This is odd and should be raised with WMATA.
# Open, in the data viewer, the ten records either side of four sample rows
# (RowNum_OG 204, 450, 488 and 520).
View(
  AllDays_NewOrder %>%
    filter(
      between(RowNum_OG, 194, 214) |   # around 204
        between(RowNum_OG, 440, 460) | # around 450
        between(RowNum_OG, 478, 498) | # around 488
        between(RowNum_OG, 510, 530)   # around 520
    )
)
# Cross-tabulate DirChange2 against "is TravelDistance_Mi missing?", leaving
# out each bus's first event of the day (BusDay_EventNum == 1).
TestTable <- AllDays_NewOrder %>%
  filter(BusDay_EventNum != 1) %>%
  mutate(
    TravelDistance_NA = as.factor(
      ifelse(is.na(TravelDistance_Mi), "True", "False")
    )
  ) %>%
  group_by(DirChange2, TravelDistance_NA) %>%
  summarise(TravDistMi_NACnts = n()) %>%
  # Drop the leftover grouping so later steps see a plain table.
  ungroup()
# TestTable

# Reshape to one row per DirChange2 value with "False"/"True" count columns.
# A combination that never occurs is a count of zero, not a missing value.
TestTable_Wide <- as.data.frame(
  spread(TestTable, TravelDistance_NA, TravDistMi_NACnts, fill = 0L)
)
TestTable_Spread <- select(TestTable_Wide, False, True)

# Label the rows from the data itself rather than assuming the rows come out
# as exactly "Change" then "Same".
TestTable_RowLabels <- as.character(TestTable_Wide$DirChange2)
TestTable_RowLabels[is.na(TestTable_RowLabels)] <- "(missing)"
row.names(TestTable_Spread) <- TestTable_RowLabels
rm(TestTable_Wide, TestTable_RowLabels)
# str(TestTable_Spread)
# TestTable_Spread

# Row-wise shares: within each DirChange2 value, the fraction of records
# whose TravelDistance_Mi is / is not NA.
prop.table(
  as.table(as.matrix(TestTable_Spread)),
  1
)
False True
Change 0.8267006 0.1732994
Same 0.8884818 0.1115182
# Column-wise shares: within the "False" and "True" columns separately, the
# fraction of records falling in each DirChange2 row.
TestTable_Spread %>%
  as.matrix() %>%
  as.table() %>%
  prop.table(margin = 2)
False True
Change 0.01948009 0.03211514
Same 0.98051991 0.96788486
Investigation of TravelDistance_Mi.
Let’s look at just the TravelDistance_Mi values that are NOT “NA”.
# The cross-tab objects are no longer needed.
rm(TestTable, TestTable_Spread)

# Keep only the records that have a TravelDistance_Mi value.
TravelDistance_Mi_NoNA <- AllDays_NewOrder %>%
  filter(
    # TravelDistance_Mi != 0 &
    !is.na(TravelDistance_Mi)
  )
# Rows and columns of the full data set, before dropping NA distances.
dim(AllDays_NewOrder)
[1] 2809529 120
# Rows and columns of the subset whose TravelDistance_Mi is not NA.
dim(TravelDistance_Mi_NoNA)
[1] 2486795 120
# Number of records removed because TravelDistance_Mi was NA.
nrow(AllDays_NewOrder) - nrow(TravelDistance_Mi_NoNA)
[1] 322734
# Column types and leading values of the NA-free subset.
str(TravelDistance_Mi_NoNA)
'data.frame': 2486795 obs. of 120 variables:
$ RowNum_OG : int 3 4 5 6 7 9 10 11 12 13 ...
$ UniqueLatLng : chr "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" "38.766769__-77.169312" ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "5004572--5004573" "5004573--5002210" "5002210--5002209" "5002209--5000070" ...
$ BusDay_EventNum : int 2 3 4 5 6 7 8 9 10 11 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RteChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 6 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 1 2 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 6 3 2 8 1 2 3 4 2 6 ...
$ Start_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Start_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ StopID_Clean : chr "5004573" "5002210" "5002209" "5000070" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" "FRANCONIA-SPRGFLD STA. + BUS BAY D" ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 49 49 49 49 49 49 49 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 123 123 123 123 123 123 123 ...
$ Event_Type : int 4 4 4 3 3 4 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 1 1 3 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 9 10 10 13 14 21 21 23 23 26 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:09:47" "2016-10-03 06:10:24" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:09:47" "2016-10-03 06:10:24" ...
$ Dwell_Time : int 0 0 0 0 104 0 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 104 0 0 0 0 0 ...
$ Delta_Time : int 24 165 25 73 719 74 76 63 69 165 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 97 276 15 119 100 274 104 241 274 1 ...
$ Odometer_Distance : int 45139 46418 50115 51074 51303 55633 56163 56285 57262 58363 ...
$ Odometer_Distance_Lag1: int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Mi : num 8.55 8.79 9.49 9.67 9.72 ...
$ TravelDistance_Ft : int 1596 1279 3697 959 229 4330 530 122 977 1101 ...
$ TravelDistance_Mi : num 0.3023 0.2422 0.7002 0.1816 0.0434 ...
$ TravelDistance_Mi_Hvrs: num 0.15 0.105 0.165 0.832 0.068 ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TD_Mi_SS_q5 : num 0.025246 0.242235 0.732434 0.079432 0.000436 ...
$ TD_Mi_SS_q95 : num 0.626 0.242 1.008 0.176 10.435 ...
$ TD_Mi_SSHG_q5 : num 0.09956 0.24223 0.70019 0.18163 0.00269 ...
$ TD_Mi_SSHG_q95 : num 0.627 0.242 0.7 0.182 0.497 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_SS_Mean : num 0.437 0.242 0.908 0.128 1.166 ...
$ TD_Mi_SS_Mean_F : num 0.457 0.242 0.977 NaN 0.226 ...
$ TD_Mi_SSHG_Mean : num 0.442 0.242 0.7 0.182 0.232 ...
$ TD_Mi_SSHG_Mean_F : num 0.491 0.242 0.7 0.182 0.228 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_SS_Med : num 0.5116 0.2422 0.9616 0.1278 0.0426 ...
$ TD_Mi_SS_Med_F : num 0.5116 0.2422 1.0081 NA 0.0426 ...
$ TD_Mi_SSHG_Med : num 0.512 0.242 0.7 0.182 0.108 ...
$ TD_Mi_SSHG_Med_F : num 0.512 0.242 0.7 0.182 0.108 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TD_Mi_SS_Cnt : int 14 1 4 2 87 22 118 91 11 2 ...
$ TD_Mi_SS_Cnt_F : int 12 1 3 0 77 18 106 81 9 0 ...
$ TD_Mi_SSHG_Cnt : int 7 1 1 1 23 6 29 28 3 1 ...
$ TD_Mi_SSHG_Cnt_F : int 5 1 1 1 19 4 25 24 1 1 ...
$ TravelTime_Sec : num 180 37 25 190 29 288 52 76 8 189 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Sec_SS_q5 : num 11.9 37 30.5 172.9 10 ...
$ TT_Sec_SS_q95 : num 346.3 37 75.8 189.1 1737.2 ...
$ TT_Sec_SSHG_q5 : num 59.6 37 25 190 11.6 236 51.5 55 8.8 189 ...
$ TT_Sec_SSHG_q95 : num 276 37 25 190 675 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_SS_Mean : num 215.8 37 58.2 181 585.3 ...
$ TT_Sec_SS_Mean_F : num 218.9 37 65.5 NaN 249.3 ...
$ TT_Sec_SSHG_Mean : num 202 37 25 190 257 ...
$ TT_Sec_SSHG_Mean_F : num 226 37 25 190 244 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_SS_Med : num 223.5 37 65.5 181 33 ...
$ TT_Sec_SS_Med_F : num 223.5 37 65.5 NA 32 ...
$ TT_Sec_SSHG_Med : num 219 37 25 190 134 286 60 65 16 189 ...
$ TT_Sec_SSHG_Med_F : num 219 37 25 190 134 286 60 65 16 189 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Sec_SS_Cnt : int 14 1 4 2 173 22 141 141 11 2 ...
$ TT_Sec_SS_Cnt_F : int 12 1 2 0 156 18 127 128 9 0 ...
$ TT_Sec_SSHG_Cnt : int 7 1 1 1 35 6 36 35 3 1 ...
$ TT_Sec_SSHG_Cnt_F : int 5 1 1 1 31 4 32 32 1 1 ...
$ TravelTime_Hr : num 0.05 0.01028 0.00694 0.05278 0.00806 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TT_Hr_SS_q5 : num 0.00331 0.01028 0.00849 0.04803 0.00278 ...
$ TT_Hr_SS_q95 : num 0.0962 0.0103 0.0211 0.0525 0.4826 ...
[list output truncated]
# Per-column summary statistics of the NA-free subset.
summary(TravelDistance_Mi_NoNA)
RowNum_OG UniqueLatLng group StartStop_ID
Min. : 3 Length:2486795 1:496190 Length:2486795
1st Qu.: 786568 Class :character 2:497932 Class :character
Median :1590497 Mode :character 3:501611 Mode :character
Mean :1578192 4:495069
3rd Qu.:2351264 5:495993
Max. :3119443
BusDay_EventNum Bus_ID Route RteChange2
Min. : 2.0 Min. : 11 Length:2486795 Change: 13709
1st Qu.: 115.0 1st Qu.:2923 Class :character Same :2473086
Median : 251.0 Median :6202 Mode :character
Mean : 293.2 Mean :5431
3rd Qu.: 431.0 3rd Qu.:7113
Max. :1344.0 Max. :8105
RouteAlt DirChange2 Route_Direction Stop_Sequence
2 :994645 Change: 48443 SOUTH :667198 Min. : 1.00
1 :943279 Same :2438352 NORTH :662471 1st Qu.: 12.00
3 :229032 WEST :565616 Median : 24.00
4 :117090 EAST :543386 Mean : 27.13
5 : 67811 LOOP : 33484 3rd Qu.: 39.00
6 : 51391 CLOCKWIS: 7012 Max. :104.00
(Other): 83547 (Other) : 7628
Start_ID Start_Desc StopID_Clean StopID_Indicator
Length:2486795 Length:2486795 Length:2486795 ID_Bad: 14271
Class :character Class :character Class :character ID_OK :2472524
Mode :character Mode :character Mode :character
Stop_Desc countryCode Stop_State Stop_County
Length:2486795 US :2485808 DC :1148297 District of Columbia:1148297
Class :character NA's: 987 MD : 872720 Prince George's : 525575
Mode :character VA : 464791 Montgomery : 345558
NA's: 987 Fairfax : 178174
Arlington : 176087
(Other) : 112117
NA's : 987
Stop_City Stop_Zip Event_Type
Washington :1147938 20020 : 136778 Min. :3.000
Silver Spring: 201381 20019 : 110150 1st Qu.:3.000
Arlington : 175923 20032 : 106396 Median :4.000
Hyattsville : 151249 20011 : 105580 Mean :3.626
Alexandria : 90916 20002 : 91962 3rd Qu.:4.000
(Other) : 718401 (Other):1934942 Max. :5.000
NA's : 987 NA's : 987
Event_Description Event_Time_Yr
Serviced Stop : 930934 Min. :2016
Unknown Stop : 1794 1st Qu.:2016
UnServiced Stop :1554067 Median :2016
Mean :2016
3rd Qu.:2016
Max. :2016
Event_Time_Mth Event_Time_Date Event_Time_Day Event_Time_Hr Event_Time_HrGroup
Min. :10 Min. :3.000 Sun : 0 Min. : 0.00 Group6_8 :538348
1st Qu.:10 1st Qu.:4.000 Mon :496190 1st Qu.: 8.00 Group15_17:497156
Median :10 Median :5.000 Tues :497932 Median :13.00 Group18_20:408957
Mean :10 Mean :4.999 Wed :501611 Mean :12.99 Group9_11 :351804
3rd Qu.:10 3rd Qu.:6.000 Thurs:495069 3rd Qu.:18.00 Group12_14:314050
Max. :10 Max. :7.000 Fri :495993 Max. :23.00 Group21_23:217259
Sat : 0 (Other) :159221
Event_Time_Min Event_Time Departure_Time
Min. : 0.00 Min. :2016-10-03 00:00:09 Min. :2016-10-03 00:00:09
1st Qu.:14.00 1st Qu.:2016-10-04 08:35:52 1st Qu.:2016-10-04 08:35:59
Median :29.00 Median :2016-10-05 13:46:00 Median :2016-10-05 13:46:06
Mean :29.43 Mean :2016-10-05 13:27:43 Mean :2016-10-05 13:27:49
3rd Qu.:44.00 3rd Qu.:2016-10-06 17:57:32 3rd Qu.:2016-10-06 17:57:39
Max. :59.00 Max. :2016-10-07 23:59:59 Max. :2016-10-08 00:12:31
Dwell_Time Dwell_Time2 Delta_Time Latitude
Min. : 0.00 Min. : 0.000 Min. :-5606.0 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 16.0 1st Qu.:38.86
Median : 0.00 Median : 0.000 Median : 160.0 Median :38.90
Mean : 11.86 Mean : 5.994 Mean : 274.1 Mean :38.91
3rd Qu.: 4.00 3rd Qu.: 4.000 3rd Qu.: 402.0 3rd Qu.:38.96
Max. :6205.00 Max. :6205.000 Max. : 9426.0 Max. :39.19
Longitude Heading Odometer_Distance Odometer_Distance_Lag1
Min. :-77.45 Min. : 0.0 Min. : 1 Min. : 0
1st Qu.:-77.07 1st Qu.: 89.0 1st Qu.: 200268 1st Qu.: 198635
Median :-77.01 Median :180.0 Median : 394700 Median : 393026
Mean :-77.02 Mean :176.7 Mean : 443225 Mean : 441601
3rd Qu.:-76.97 3rd Qu.:269.0 3rd Qu.: 633936 3rd Qu.: 632313
Max. : 0.00 Max. :360.0 Max. :11108034 Max. :10853226
Odometer_Distance_Mi TravelDistance_Ft TravelDistance_Mi TravelDistance_Mi_Hvrs
Min. : 0.0002 Min. : 1 Min. : 0.00019 Min. : 0.0000
1st Qu.: 37.9295 1st Qu.: 699 1st Qu.: 0.13239 1st Qu.: 0.1034
Median : 74.7538 Median : 1044 Median : 0.19773 Median : 0.1378
Mean : 83.9442 Mean : 1624 Mean : 0.30760 Mean : 0.1918
3rd Qu.: 120.0635 3rd Qu.: 1518 3rd Qu.: 0.28750 3rd Qu.: 0.1828
Max. :2103.7943 Max. :1323464 Max. :250.65606 Max. :24.1507
TD_Mi_q2 TD_Mi_q98 TD_Mi_SS_q5 TD_Mi_SS_q95
Min. :0.05208 Min. :0.9585 Min. : 0.00019 Min. : 0.00019
1st Qu.:0.05208 1st Qu.:0.9585 1st Qu.: 0.08848 1st Qu.: 0.25878
Median :0.05208 Median :0.9585 Median : 0.10608 Median : 0.32239
Mean :0.05208 Mean :0.9585 Mean : 0.16872 Mean : 0.47949
3rd Qu.:0.05208 3rd Qu.:0.9585 3rd Qu.: 0.13977 3rd Qu.: 0.42822
Max. :0.05208 Max. :0.9585 Max. :219.16288 Max. :246.94938
TD_Mi_SSHG_q5 TD_Mi_SSHG_q95 TD_Mi_Mean TD_Mi_Mean_F
Min. : 0.00019 Min. : 0.00019 Min. :0.3076 Min. :0.2318
1st Qu.: 0.09167 1st Qu.: 0.24754 1st Qu.:0.3076 1st Qu.:0.2318
Median : 0.11395 Median : 0.31174 Median :0.3076 Median :0.2318
Mean : 0.18528 Mean : 0.46625 Mean :0.3076 Mean :0.2318
3rd Qu.: 0.15093 3rd Qu.: 0.41899 3rd Qu.:0.3076 3rd Qu.:0.2318
Max. :250.65606 Max. :250.65606 Max. :0.3076 Max. :0.2318
TD_Mi_SS_Mean TD_Mi_SS_Mean_F TD_Mi_SSHG_Mean TD_Mi_SSHG_Mean_F
Min. : 0.00019 Min. : 0.0002 Min. : 0.00019 Min. : 0.000
1st Qu.: 0.17129 1st Qu.: 0.1663 1st Qu.: 0.16760 1st Qu.: 0.163
Median : 0.21082 Median : 0.2058 Median : 0.20965 Median : 0.206
Mean : 0.30760 Mean : 0.2916 Mean : 0.30760 Mean : 0.294
3rd Qu.: 0.26422 3rd Qu.: 0.2582 3rd Qu.: 0.26616 3rd Qu.: 0.262
Max. :219.16288 Max. :219.1629 Max. :250.65606 Max. :250.656
NA's :2678 NA's :4904
TD_Mi_Med TD_Mi_Med_F TD_Mi_SS_Med TD_Mi_SS_Med_F
Min. :0.1977 Min. :0.1977 Min. : 0.00019 Min. : 0.0002
1st Qu.:0.1977 1st Qu.:0.1977 1st Qu.: 0.14602 1st Qu.: 0.1458
Median :0.1977 Median :0.1977 Median : 0.19470 Median : 0.1947
Mean :0.1977 Mean :0.1977 Mean : 0.28931 Mean : 0.2827
3rd Qu.:0.1977 3rd Qu.:0.1977 3rd Qu.: 0.26326 3rd Qu.: 0.2633
Max. :0.1977 Max. :0.1977 Max. :219.16288 Max. :219.1629
NA's :2678
TD_Mi_SSHG_Med TD_Mi_SSHG_Med_F TD_Mi_Cnt TD_Mi_Cnt_F
Min. : 0.00019 Min. : 0.000 Min. :2486795 Min. :2387406
1st Qu.: 0.14403 1st Qu.: 0.144 1st Qu.:2486795 1st Qu.:2387406
Median : 0.19527 Median : 0.195 Median :2486795 Median :2387406
Mean : 0.29152 Mean : 0.285 Mean :2486795 Mean :2387406
3rd Qu.: 0.26657 3rd Qu.: 0.266 3rd Qu.:2486795 3rd Qu.:2387406
Max. :250.65606 Max. :250.656 Max. :2486795 Max. :2387406
NA's :4904
TD_Mi_SS_Cnt TD_Mi_SS_Cnt_F TD_Mi_SSHG_Cnt TD_Mi_SSHG_Cnt_F
Min. : 1.0 Min. : 0.0 Min. : 1.00 Min. : 0.00
1st Qu.: 178.0 1st Qu.: 160.0 1st Qu.: 28.00 1st Qu.: 24.00
Median : 295.0 Median : 266.0 Median : 48.00 Median : 42.00
Mean : 363.3 Mean : 327.1 Mean : 60.01 Mean : 53.31
3rd Qu.: 476.0 3rd Qu.: 428.0 3rd Qu.: 78.00 3rd Qu.: 70.00
Max. :1543.0 Max. :1388.0 Max. :663.00 Max. :595.00
TravelTime_Sec TT_Sec_q2 TT_Sec_q98 TT_Sec_SS_q5 TT_Sec_SS_q95
Min. : 1 Min. :10 Min. :349 Min. : 1.00 Min. : 1.00
1st Qu.: 24 1st Qu.:10 1st Qu.:349 1st Qu.: 15.00 1st Qu.: 47.00
Median : 38 Median :10 Median :349 Median : 21.00 Median : 77.75
Mean : 100 Mean :10 Mean :349 Mean : 57.38 Mean : 176.22
3rd Qu.: 70 3rd Qu.:10 3rd Qu.:349 3rd Qu.: 32.00 3rd Qu.: 129.65
Max. :54551 Max. :10 Max. :349 Max. :54551.00 Max. :54551.00
NA's :28
TT_Sec_SSHG_q5 TT_Sec_SSHG_q95 TT_Sec_Mean TT_Sec_Mean_F
Min. : 1.00 Min. : 1.00 Min. :104.9 Min. :56.61
1st Qu.: 15.20 1st Qu.: 42.70 1st Qu.:104.9 1st Qu.:56.61
Median : 22.50 Median : 70.55 Median :104.9 Median :56.61
Mean : 62.94 Mean : 161.25 Mean :104.9 Mean :56.61
3rd Qu.: 34.80 3rd Qu.: 119.60 3rd Qu.:104.9 3rd Qu.:56.61
Max. :54551.00 Max. :54551.00 Max. :104.9 Max. :56.61
TT_Sec_SS_Mean TT_Sec_SS_Mean_F TT_Sec_SSHG_Mean TT_Sec_SSHG_Mean_F
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 28.20 1st Qu.: 26.62 1st Qu.: 27.51 1st Qu.: 26.34
Median : 42.61 Median : 40.46 Median : 41.76 Median : 39.96
Mean : 99.62 Mean : 86.96 Mean : 99.55 Mean : 88.81
3rd Qu.: 69.71 3rd Qu.: 66.44 3rd Qu.: 70.02 3rd Qu.: 67.22
Max. :54551.00 Max. :54551.00 Max. :54551.00 Max. :54551.00
NA's :2603 NA's :3772
TT_Sec_Med TT_Sec_Med_F TT_Sec_SS_Med TT_Sec_SS_Med_F
Min. :39 Min. :39 Min. : 1.00 Min. : 1.00
1st Qu.:39 1st Qu.:39 1st Qu.: 25.00 1st Qu.: 25.00
Median :39 Median :39 Median : 37.00 Median : 37.00
Mean :39 Mean :39 Mean : 86.88 Mean : 80.62
3rd Qu.:39 3rd Qu.:39 3rd Qu.: 62.00 3rd Qu.: 62.00
Max. :39 Max. :39 Max. :54551.00 Max. :54551.00
NA's :2603
TT_Sec_SSHG_Med TT_Sec_SSHG_Med_F TT_Sec_Cnt TT_Sec_Cnt_F
Min. : 1.00 Min. : 1.00 Min. :2802888 Min. :2705189
1st Qu.: 25.00 1st Qu.: 25.00 1st Qu.:2802888 1st Qu.:2705189
Median : 37.00 Median : 37.00 Median :2802888 Median :2705189
Mean : 90.07 Mean : 83.87 Mean :2802888 Mean :2705189
3rd Qu.: 64.00 3rd Qu.: 64.00 3rd Qu.:2802888 3rd Qu.:2705189
Max. :54551.00 Max. :54551.00 Max. :2802888 Max. :2705189
NA's :3772
TT_Sec_SS_Cnt TT_Sec_SS_Cnt_F TT_Sec_SSHG_Cnt TT_Sec_SSHG_Cnt_F
Min. : 1.0 Min. : 0.0 Min. : 1.0 Min. : 0.00
1st Qu.: 200.0 1st Qu.: 183.0 1st Qu.: 30.0 1st Qu.: 27.00
Median : 321.0 Median : 292.0 Median : 52.0 Median : 47.00
Mean : 392.4 Mean : 357.2 Mean : 64.7 Mean : 58.23
3rd Qu.: 509.0 3rd Qu.: 464.0 3rd Qu.: 84.0 3rd Qu.: 76.00
Max. :1664.0 Max. :1523.0 Max. :691.0 Max. :634.00
TravelTime_Hr TT_Hr_q2 TT_Hr_q98 TT_Hr_SS_q5
Min. : 0.000278 Min. :0.002778 Min. :0.09694 Min. : 0.000278
1st Qu.: 0.006667 1st Qu.:0.002778 1st Qu.:0.09694 1st Qu.: 0.004167
Median : 0.010556 Median :0.002778 Median :0.09694 Median : 0.005833
Mean : 0.027782 Mean :0.002778 Mean :0.09694 Mean : 0.015938
3rd Qu.: 0.019444 3rd Qu.:0.002778 3rd Qu.:0.09694 3rd Qu.: 0.008889
Max. :15.153056 Max. :0.002778 Max. :0.09694 Max. :15.153056
NA's :28
TT_Hr_SS_q95 TT_Hr_SSHG_q5 TT_Hr_SSHG_q95 TT_Hr_Mean
Min. : 0.000278 Min. : 0.000278 Min. : 0.000278 Min. :0.02913
1st Qu.: 0.013056 1st Qu.: 0.004222 1st Qu.: 0.011861 1st Qu.:0.02913
Median : 0.021597 Median : 0.006250 Median : 0.019597 Median :0.02913
Mean : 0.048950 Mean : 0.017485 Mean : 0.044792 Mean :0.02913
3rd Qu.: 0.036014 3rd Qu.: 0.009667 3rd Qu.: 0.033222 3rd Qu.:0.02913
Max. :15.153056 Max. :15.153056 Max. :15.153056 Max. :0.02913
TT_Hr_Mean_F TT_Hr_SS_Mean TT_Hr_SS_Mean_F TT_Hr_SSHG_Mean
Min. :0.01573 Min. : 0.000278 Min. : 0.0003 Min. : 0.000278
1st Qu.:0.01573 1st Qu.: 0.007832 1st Qu.: 0.0074 1st Qu.: 0.007643
Median :0.01573 Median : 0.011836 Median : 0.0112 Median : 0.011600
Mean :0.01573 Mean : 0.027673 Mean : 0.0242 Mean : 0.027654
3rd Qu.:0.01573 3rd Qu.: 0.019363 3rd Qu.: 0.0185 3rd Qu.: 0.019450
Max. :0.01573 Max. :15.153056 Max. :15.1531 Max. :15.153056
NA's :2612
TT_Hr_SSHG_Mean_F TT_Hr_Med TT_Hr_Med_F TT_Hr_SS_Med
Min. : 0.000 Min. :0.01083 Min. :0.01083 Min. : 0.000278
1st Qu.: 0.007 1st Qu.:0.01083 1st Qu.:0.01083 1st Qu.: 0.006944
Median : 0.011 Median :0.01083 Median :0.01083 Median : 0.010278
Mean : 0.025 Mean :0.01083 Mean :0.01083 Mean : 0.024132
3rd Qu.: 0.019 3rd Qu.:0.01083 3rd Qu.:0.01083 3rd Qu.: 0.017222
Max. :15.153 Max. :0.01083 Max. :0.01083 Max. :15.153056
NA's :3842
TT_Hr_SS_Med_F TT_Hr_SSHG_Med TT_Hr_SSHG_Med_F TT_Hr_Cnt
Min. : 0.0003 Min. : 0.000278 Min. : 0.000 Min. :2802888
1st Qu.: 0.0069 1st Qu.: 0.006944 1st Qu.: 0.007 1st Qu.:2802888
Median : 0.0103 Median : 0.010278 Median : 0.010 Median :2802888
Mean : 0.0224 Mean : 0.025019 Mean : 0.023 Mean :2802888
3rd Qu.: 0.0172 3rd Qu.: 0.017778 3rd Qu.: 0.018 3rd Qu.:2802888
Max. :15.1531 Max. :15.153056 Max. :15.153 Max. :2802888
NA's :2612 NA's :3842
TT_Hr_Cnt_F TT_Hr_SS_Cnt TT_Hr_SS_Cnt_F TT_Hr_SSHG_Cnt TT_Hr_SSHG_Cnt_F
Min. :2705189 Min. : 1.0 Min. : 0 Min. : 1.0 Min. : 0.00
1st Qu.:2705189 1st Qu.: 200.0 1st Qu.: 183 1st Qu.: 30.0 1st Qu.: 27.00
Median :2705189 Median : 321.0 Median : 292 Median : 52.0 Median : 47.00
Mean :2705189 Mean : 392.4 Mean : 357 Mean : 64.7 Mean : 58.19
3rd Qu.:2705189 3rd Qu.: 509.0 3rd Qu.: 464 3rd Qu.: 84.0 3rd Qu.: 76.00
Max. :2705189 Max. :1664.0 Max. :1523 Max. :691.0 Max. :634.00
SpeedAvg_Mph
Min. : 0.00
1st Qu.: 10.10
Median : 16.68
Mean : 26.54
3rd Qu.: 31.17
Max. :22924.09
NA's :28
Investigation of TravelDistance_Mi.
Let’s plot just the TravelDistance_Mi values that are NOT “NA”.
# Density-scaled histogram (0.05-mile bins) of the non-missing travel
# distances with a red kernel-density curve on top. The view is zoomed to
# 0-1.5 miles / density 0-4 via coord_cartesian(), which crops the display
# without discarding data.
TravDistMi_HistDen <- TravelDistance_Mi_NoNA %>%
  select(TravelDistance_Mi) %>%
  ggplot(aes(x = TravelDistance_Mi, y = ..density..)) +
  geom_histogram(
    binwidth = 0.05,
    fill = "lightblue",
    colour = "grey60",
    size = 0.2
  ) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 4.0)) +
  labs(
    title = "Variation in Distance Between Stops",
    x = "Travel Distance (miles)",
    y = "Density"
  )

TravDistMi_HistDen
Investigation of TravelDistance_Mi.
Looking at the extremely large TravelDistance_Mi values. Some (approx. 27%) of the TravelDistance_Mi values > 1 mile occur when DirChange2 changes…but what about the other ~73%?
Investigation of TravelDistance_Mi.
Any relation with DirChange2? Doesn’t look as if this is so.
# Cross-tabulate DirChange2 against "is this an extreme travel distance?",
# using only records that have a TravelDistance_Mi value.
ExtremeTravDist <- AllDays_NewOrder %>%
  filter(!is.na(TravelDistance_Mi)) %>%
  mutate(
    TravDist_Extreme = ifelse(
      TravelDistance_Mi > 1.1587121212, # 1.1587121212 is the 99th percentile
      "True",
      "False"
    )
  ) %>%
  group_by(DirChange2, TravDist_Extreme) %>%
  summarise(TravDistMI_ExtCnts = n()) %>%
  # Drop the leftover grouping so later steps see a plain table.
  ungroup()
# ExtremeTravDist

# Reshape to one row per DirChange2 value with "False"/"True" count columns.
# A combination that never occurs is a count of zero, not a missing value.
ExtremeTravDist_Wide <- as.data.frame(
  spread(ExtremeTravDist, TravDist_Extreme, TravDistMI_ExtCnts, fill = 0L)
)
ExtremeTravDist_Spread <- select(ExtremeTravDist_Wide, False, True)

# Label the rows from the data itself rather than assuming the rows come out
# as exactly "Change" then "Same".
ExtremeTravDist_RowLabels <- as.character(ExtremeTravDist_Wide$DirChange2)
ExtremeTravDist_RowLabels[is.na(ExtremeTravDist_RowLabels)] <- "(missing)"
row.names(ExtremeTravDist_Spread) <- ExtremeTravDist_RowLabels
rm(ExtremeTravDist_Wide, ExtremeTravDist_RowLabels)
# str(ExtremeTravDist_Spread)
# ExtremeTravDist_Spread

# Row-wise shares: within each DirChange2 value, the fraction of records
# that are / are not extreme.
prop.table(
  as.table(as.matrix(ExtremeTravDist_Spread)),
  1
)
False True
Change 0.80622587 0.19377413
Same 0.98855456 0.01144544
# Column-wise shares: within the "False" and "True" columns separately, the
# fraction of records falling in each DirChange2 row.
ExtremeTravDist_Spread %>%
  as.matrix() %>%
  as.table() %>%
  prop.table(margin = 2)
False True
Change 0.01594448 0.25169594
Same 0.98405552 0.74830406
Investigation of TravelDistance_Mi.
Looking at specific buses and StartStop_ID.
Investigation of TravelDistance_Mi & TravelDistance_Mi_New.
If TravelDistance_Mi is below the 5th percentile for that StartStop_ID, or if TravelDistance_Mi is above the 95th percentile for that StartStop_ID, or if TravelDistance_Mi is NA (when BusDay_EventNum != 1), consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TD_Mi_SSHG_Mean_F), or, if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
# View(tail(AllDays_NewOrder, 500))

# Build the cleaned travel-distance columns.
#
# The replacement rules are evaluated once, in priority order, into a
# temporary "source" column naming where each row's value comes from:
#   1. outlier, >= 20 filtered StartStop/HourGroup values -> TD_Mi_SSHG_Mean_F
#   2. outlier, otherwise >= 20 filtered StartStop values -> TD_Mi_SS_Mean_F
#   3. outlier, otherwise >= 20 unfiltered StartStop values -> TD_Mi_SS_Mean
#   4. missing (not first event), Haversine != 0 -> TravelDistance_Mi_Hvrs
#   5. missing (not first event), Haversine == 0 -> TD_Mi_SS_Mean
#   6. everything else keeps TravelDistance_Mi
# "Outlier" means a non-missing TravelDistance_Mi outside
# [TD_Mi_SSHG_q5, TD_Mi_SSHG_q95]. The value and its label are both derived
# from that one source column, so they cannot drift apart. A rule whose test
# is NA (e.g. a missing Haversine distance in rule 4) yields NA for both.
# The Tmp_* helper columns are removed again at the end.
AllDays_NewTravelDist <- AllDays_NewOrder %>%
  mutate(
    Tmp_TD_IsOutlier = !is.na(TravelDistance_Mi) &
      (TravelDistance_Mi < TD_Mi_SSHG_q5 |
         TravelDistance_Mi > TD_Mi_SSHG_q95),
    Tmp_TD_IsMissing = is.na(TravelDistance_Mi) &
      BusDay_EventNum != 1,
    Tmp_TD_Source = ifelse(
      Tmp_TD_IsOutlier & TD_Mi_SSHG_Cnt_F >= 20,
      "TD_Mi_SSHG_Mean_F",
      ifelse(
        Tmp_TD_IsOutlier & TD_Mi_SSHG_Cnt_F < 20 & TD_Mi_SS_Cnt_F >= 20,
        "TD_Mi_SS_Mean_F",
        ifelse(
          Tmp_TD_IsOutlier & TD_Mi_SS_Cnt_F < 20 & TD_Mi_SS_Cnt >= 20,
          "TD_Mi_SS_Mean",
          ifelse(
            Tmp_TD_IsMissing & TravelDistance_Mi_Hvrs != 0,
            "TravelDistance_Mi_Hvrs",
            ifelse(
              Tmp_TD_IsMissing & TravelDistance_Mi_Hvrs == 0,
              "TD_Mi_SS_Mean",
              "TravelDistance_Mi"
            )
          )
        )
      )
    ),
    # Look the value up from the column named by the source.
    TravelDistance_Mi_New = ifelse(
      Tmp_TD_Source == "TD_Mi_SSHG_Mean_F",
      TD_Mi_SSHG_Mean_F,
      ifelse(
        Tmp_TD_Source == "TD_Mi_SS_Mean_F",
        TD_Mi_SS_Mean_F,
        ifelse(
          Tmp_TD_Source == "TD_Mi_SS_Mean",
          TD_Mi_SS_Mean,
          ifelse(
            Tmp_TD_Source == "TravelDistance_Mi_Hvrs",
            TravelDistance_Mi_Hvrs,
            TravelDistance_Mi
          )
        )
      )
    ),
    TravelDistance_Mi_New_Label = factor(Tmp_TD_Source),
    # Second pass: where a usable (non-missing, non-zero) Haversine distance
    # exists and the cleaned value is still outside the overall
    # [TD_Mi_q2, TD_Mi_q98] band, use the Haversine distance instead.
    Tmp_TD_UseHvrs = !is.na(TravelDistance_Mi_Hvrs) &
      TravelDistance_Mi_Hvrs != 0 &
      (TravelDistance_Mi_New < TD_Mi_q2 |
         TravelDistance_Mi_New > TD_Mi_q98),
    TravelDistance_Mi_NewHvrs = ifelse(
      Tmp_TD_UseHvrs,
      TravelDistance_Mi_Hvrs,
      TravelDistance_Mi_New
    ),
    TravelDistance_Mi_NewHvrs_Label = factor(
      ifelse(Tmp_TD_UseHvrs, "TravelDistance_Mi_Hvrs", Tmp_TD_Source)
    ),
    SpeedAvg_Mph_NewHvrs = TravelDistance_Mi_NewHvrs / TravelTime_Hr
  ) %>%
  select(
    -Tmp_TD_IsOutlier,
    -Tmp_TD_IsMissing,
    -Tmp_TD_Source,
    -Tmp_TD_UseHvrs
  )

# The input table is no longer needed.
rm(AllDays_NewOrder)
str(AllDays_NewTravelDist)
'data.frame': 2809529 obs. of 125 variables:
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ UniqueLatLng : chr "38.767807__-77.155136" "38.769363__-77.157082" "38.769341__-77.155136" "38.766953__-77.155113" ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RteChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 2 2 2 2 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 1 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 2 2 2 2 49 49 49 49 49 49 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 150 150 150 150 123 123 123 123 123 123 ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Lag1 : int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs : num NA 0.15 0.105 0.165 0.832 ...
$ TD_Mi_q2 : num 0.0521 0.0521 0.0521 0.0521 0.0521 ...
$ TD_Mi_q98 : num 0.959 0.959 0.959 0.959 0.959 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TD_Mi_SSHG_q5 : num NA 0.0996 0.2422 0.7002 0.1816 ...
$ TD_Mi_SSHG_q95 : num NA 0.627 0.242 0.7 0.182 ...
$ TD_Mi_Mean : num 0.308 0.308 0.308 0.308 0.308 ...
$ TD_Mi_Mean_F : num 0.232 0.232 0.232 0.232 0.232 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SSHG_Mean : num NaN 0.442 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Mean_F : num NaN 0.491 0.242 0.7 0.182 ...
$ TD_Mi_Med : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_Med_F : num 0.198 0.198 0.198 0.198 0.198 ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
$ TD_Mi_SSHG_Med : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med_F : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_Cnt : int 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 2486795 ...
$ TD_Mi_Cnt_F : int 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 2387406 ...
$ TD_Mi_SS_Cnt : int 0 14 1 4 2 87 22 118 91 11 ...
$ TD_Mi_SS_Cnt_F : int 0 12 1 3 0 77 18 106 81 9 ...
$ TD_Mi_SSHG_Cnt : int 0 7 1 1 1 23 6 29 28 3 ...
$ TD_Mi_SSHG_Cnt_F : int 0 5 1 1 1 19 4 25 24 1 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TT_Sec_q2 : num 10 10 10 10 10 10 10 10 10 10 ...
$ TT_Sec_q98 : num 349 349 349 349 349 349 349 349 349 349 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Sec_SSHG_q5 : num NA 59.6 37 25 190 11.6 236 51.5 55 8.8 ...
$ TT_Sec_SSHG_q95 : num NA 276 37 25 190 ...
$ TT_Sec_Mean : num 105 105 105 105 105 ...
$ TT_Sec_Mean_F : num 56.6 56.6 56.6 56.6 56.6 ...
$ TT_Sec_SS_Mean : num NaN 215.8 37 58.2 181 ...
$ TT_Sec_SS_Mean_F : num NaN 218.9 37 65.5 NaN ...
$ TT_Sec_SSHG_Mean : num NaN 202 37 25 190 ...
$ TT_Sec_SSHG_Mean_F : num NaN 226 37 25 190 ...
$ TT_Sec_Med : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_Med_F : num 39 39 39 39 39 39 39 39 39 39 ...
$ TT_Sec_SS_Med : num NA 223.5 37 65.5 181 ...
$ TT_Sec_SS_Med_F : num NA 223.5 37 65.5 NA ...
$ TT_Sec_SSHG_Med : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Med_F : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_Cnt : int 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 2802888 ...
$ TT_Sec_Cnt_F : int 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 2705189 ...
$ TT_Sec_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Sec_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Sec_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Sec_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 32 1 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ TT_Hr_q2 : num 0.00278 0.00278 0.00278 0.00278 0.00278 ...
$ TT_Hr_q98 : num 0.0969 0.0969 0.0969 0.0969 0.0969 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
[list output truncated]
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Quick summary and then correlation calculation.
# Only 38 rows still meet this criterion -- apparently the cases where both
# the lat/long calculation and the TravelDistance calculation did not work
# properly.
View(
  AllDays_NewTravelDist %>%
    filter(
      is.na(TravelDistance_Mi_New),
      BusDay_EventNum != 1
    )
)

# The 500 records with the largest cleaned travel distance.
View(
  head(
    arrange(AllDays_NewTravelDist, desc(TravelDistance_Mi_New)),
    500
  )
)

# Compare the four distance variants over all records.
AllDays_NewTravelDist %>%
  select(
    TravelDistance_Mi,
    TravelDistance_Mi_Hvrs,
    TravelDistance_Mi_New,
    TravelDistance_Mi_NewHvrs
  ) %>%
  summary()
TravelDistance_Mi TravelDistance_Mi_Hvrs TravelDistance_Mi_New
Min. : 0.0 Min. : 0.000 Min. : 0.000
1st Qu.: 0.1 1st Qu.: 0.106 1st Qu.: 0.141
Median : 0.2 Median : 0.142 Median : 0.199
Mean : 0.3 Mean : 0.201 Mean : 0.298
3rd Qu.: 0.3 3rd Qu.: 0.193 3rd Qu.: 0.276
Max. :250.7 Max. :24.407 Max. :250.656
NA's :322734 NA's :6528 NA's :6566
TravelDistance_Mi_NewHvrs
Min. : 0.000
1st Qu.: 0.142
Median : 0.199
Mean : 0.259
3rd Qu.: 0.276
Max. :36.236
NA's :6566
# Same comparison, leaving out each bus's first event of the day.
AllDays_NewTravelDist %>%
  filter(BusDay_EventNum != 1) %>%
  select(
    TravelDistance_Mi,
    TravelDistance_Mi_Hvrs,
    TravelDistance_Mi_New,
    TravelDistance_Mi_NewHvrs
  ) %>%
  summary()
TravelDistance_Mi TravelDistance_Mi_Hvrs TravelDistance_Mi_New
Min. : 0.00 Min. : 0.0000 Min. : 0.00019
1st Qu.: 0.13 1st Qu.: 0.1055 1st Qu.: 0.14072
Median : 0.20 Median : 0.1424 Median : 0.19867
Mean : 0.31 Mean : 0.2008 Mean : 0.29751
3rd Qu.: 0.29 3rd Qu.: 0.1935 3rd Qu.: 0.27633
Max. :250.66 Max. :24.4068 Max. :250.65606
NA's :316206 NA's :38
TravelDistance_Mi_NewHvrs
Min. : 0.00019
1st Qu.: 0.14205
Median : 0.19903
Mean : 0.25859
3rd Qu.: 0.27557
Max. :36.23636
NA's :38
# Pairwise correlations between the four distance variants; each pair uses
# every record where both of its values are present.
AllDays_NewTravelDist %>%
  select(
    TravelDistance_Mi,
    TravelDistance_Mi_Hvrs,
    TravelDistance_Mi_New,
    TravelDistance_Mi_NewHvrs
  ) %>%
  cor(use = "pairwise.complete.obs")
TravelDistance_Mi TravelDistance_Mi_Hvrs
TravelDistance_Mi 1.0000000 0.5447660
TravelDistance_Mi_Hvrs 0.5447660 1.0000000
TravelDistance_Mi_New 0.9513379 0.5837182
TravelDistance_Mi_NewHvrs 0.6005944 0.9005277
TravelDistance_Mi_New TravelDistance_Mi_NewHvrs
TravelDistance_Mi 0.9513379 0.6005944
TravelDistance_Mi_Hvrs 0.5837182 0.9005277
TravelDistance_Mi_New 1.0000000 0.6346981
TravelDistance_Mi_NewHvrs 0.6346981 1.0000000
Investigation of TravelDistance_Mi_New_Label & TravelDistance_Mi_NewHvrs_Label.
Show how the labels changed.
# Cross-tabulate the two *_Label columns: how many rows fall into each
# (TravelDistance_Mi_New_Label, TravelDistance_Mi_NewHvrs_Label) combination,
# and what share of all rows that is.
AllDays_NewTravelDist %>%
  group_by(TravelDistance_Mi_New_Label,
           TravelDistance_Mi_NewHvrs_Label) %>%
  summarise(CntNum = n(),
            # scientific = 9999 penalises scientific notation so the share
            # prints in fixed notation; note the result is a character string.
            CntPct = format(CntNum / nrow(AllDays_NewTravelDist),
                            scientific = 9999)) %>%
  ungroup() %>%
  # Sort on the numeric count: CntPct is text, so sorting on it would order
  # rows lexically (and locale-dependently) rather than by size. CntNum and
  # CntPct are proportional, so the intended order is the same.
  arrange(desc(CntNum))
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Graphing the two methods of calculating TravelDistance_Mi.
First, let’s create a function to build the linear model equation label for the plot.
# Build a plotmath label of the form "y = a +/- b . x, r^2 = ..." for the
# simple linear regression of `y` on `x`.
#
# Args:
#   df: Retained for backward compatibility with existing callers. It is not
#       used for the fit: the regression always uses the `x` and `y` vectors
#       passed in (previously a column of `df` named "x" or "y" would have
#       silently taken precedence over the arguments).
#   y:  Numeric vector, the response.
#   x:  Numeric vector, the predictor (same length as `y`).
#
# Returns:
#   A length-one character string holding a plotmath expression, intended for
#   use as a label with `parse = TRUE` (e.g. in ggplot2::annotate()).
lm_eqn <- function(df, y, x) {
  m <- lm(y ~ x, data = data.frame(x = x, y = y))

  # unname() matters: format() keeps the coefficient names, and a named
  # vector is deparsed as c(`(Intercept)` = "...") inside the label text.
  intercept <- unname(coef(m)[1])
  slope <- unname(coef(m)[2])

  l <- list(
    a = format(intercept, digits = 2),
    b = format(abs(slope), digits = 2),
    # The sign is carried separately because `b` is the absolute slope.
    s1 = if (!is.na(slope) && slope > 0) "+" else "-",
    r2 = format(summary(m)$r.squared, digits = 3)
  )

  eq <- substitute(
    italic(y) == a~~s1~~b %.% italic(x)*","~~italic(r)^2~"="~r2,
    l
  )
  as.character(as.expression(eq))
}
Investigation of TravelDistance_Mi & TravelDistance_Mi_NewHvrs.
Scatter plot (using a 10% sample to make plotting faster and to avoid over-plotting redundant points in the same plot).
# Fixed seed so the 10% sample below is reproducible.
set.seed(123456789)
# Keep rows where both distance measures are present, expose the Haversine
# label under the shorter name DistMethod, then draw a 10% random sample.
AllDays_NewTravelDist_10Pct <- AllDays_NewTravelDist %>%
  filter(!is.na(TravelDistance_Mi_NewHvrs),
         !is.na(TravelDistance_Mi)) %>%
  rename(DistMethod = TravelDistance_Mi_NewHvrs_Label) %>%
  sample_frac(size = 0.1)
# Scatter plot of the recorded distance (x) against the recalculated
# Haversine-based distance (y) on the 10% sample, coloured by DistMethod.
# A blue least-squares line and a red slope-1 reference line are overlaid,
# and the fitted equation is printed on the plot via lm_eqn().
TravDist_MiVsCalc <- AllDays_NewTravelDist_10Pct %>%
  select(TravelDistance_Mi_NewHvrs,
         TravelDistance_Mi,
         DistMethod) %>%
  ggplot(aes(x = TravelDistance_Mi,
             y = TravelDistance_Mi_NewHvrs,
             colour = DistMethod)) +
  scale_colour_manual(values = c("red", "blue", "green", "orange", "black")) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  # Zoom (without dropping data) to the 0-1.5 mile square.
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 1.5)) +
  scale_x_continuous(breaks = seq(0, 1.5, 0.25)) +
  scale_y_continuous(breaks = seq(0, 1.5, 0.25)) +
  theme(legend.position = "bottom", # alternative tried: c(0.85, 0.40)
        legend.text = element_text(size = 6)) +
  # Regression equation label (earlier position tried: x = 62, y = 20).
  annotate(geom = "text",
           label = lm_eqn(df = AllDays_NewTravelDist_10Pct,
                          x = AllDays_NewTravelDist_10Pct$TravelDistance_Mi,
                          y = AllDays_NewTravelDist_10Pct$TravelDistance_Mi_NewHvrs),
           x = 0.70,
           y = 0.00,
           size = 3,
           colour = "blue",
           parse = TRUE) +
  # Reference-line label (earlier position tried: x = 16, y = 30).
  annotate(geom = "text",
           label = "Reference Line (slope = 1)",
           x = 0.80,
           y = 1.05,
           size = 3,
           colour = "red") +
  labs(title = "TravelDistance_Mi vs. TravelDistance_Mi_NewHvrs",
       x = "TravelDistance_Mi",
       y = "TravelDistance_Mi_NewHvrs")
# Optional extra layer, currently disabled: + geom_jitter()
TravDist_MiVsCalc
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Graphing test with rbokeh.
# Interactive rbokeh version of the same scatter plot: points coloured by
# DistMethod with hover details, plus a red slope-1 reference line.
TravDist_MiVsCalc_Bokeh <- AllDays_NewTravelDist_10Pct %>%
  select(TravelDistance_Mi_NewHvrs,
         TravelDistance_Mi,
         DistMethod) %>%
  figure(data = .,
         xlim = c(0, 1.5),
         ylim = c(0, 1.5),
         legend_location = "bottom_right") %>%
  ly_points(x = TravelDistance_Mi,
            y = TravelDistance_Mi_NewHvrs,
            color = DistMethod,
            hover = c(TravelDistance_Mi_NewHvrs, TravelDistance_Mi, DistMethod)) %>%
  ly_abline(a = 0, b = 1, color = "red")
TravDist_MiVsCalc_Bokeh
Investigation of TravelDistance_Mi_New.
Calculating the minimum TravelDistance_Mi_New value at each percentile.
Investigation of TravelDistance_Mi_NewHvrs
Calculating the minimum TravelDistance_Mi_NewHvrs value at each percentile.
# Step 1: attach to every row its percent rank of TravelDistance_Mi_NewHvrs
# (PctR_H) and that rank rounded to two decimals (PctR_Round_H).
# The rank is computed from the piped column itself rather than by reaching
# back into AllDays_NewTravelDist with `$`, so it stays row-aligned even if a
# filter or reorder is later added upstream.
# (The equivalent TravelDistance_Mi_New "_N" variants are intentionally left
# out of this table.)
TravDistMiH_Ntile <- AllDays_NewTravelDist %>%
  select(StartStop_ID,
         TravelDistance_Mi_NewHvrs_Label,
         TravelDistance_Mi_NewHvrs) %>%
  as.data.frame() %>%
  mutate(PctR_H = percent_rank(TravelDistance_Mi_NewHvrs),
         PctR_Round_H = round(PctR_H, 2))
# str(TravDistMiH_Ntile)
# View(head(TravDistMiH_Ntile, 500))

# Total row count (NA distances included) used as the denominator below.
TravDistMiH_Ntile_Rows <- nrow(TravDistMiH_Ntile)
# View(tail(TravDistMiH_Ntile, 500))

# Step 2: per rounded percentile, the smallest distance, the number of
# non-missing distances, that number as a share of all rows, and the running
# total of those shares.
TravDistMiH_Pctiles <- TravDistMiH_Ntile %>%
  group_by(PctR_Round_H) %>%
  summarise(MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
            CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
            PctsAtPctile_H = CntsAtPctile_H / TravDistMiH_Ntile_Rows) %>%
  mutate(CumSumPAtP_H = cumsum(PctsAtPctile_H))
# View(TravDistMiH_Pctiles)
TravDistMiH_Pctiles
Join TravDistMiH_Pctiles, TravDistMiN_Pctiles, and TravDistMi_Pctiles.
~11% of rides are still showing as less than 0.1 miles of TravelDistance_Mi_NewHvrs.
Investigation of TravelDistance_Mi_New.
Why are there still some very small or very large TravelDistance_Mi_NewHvrs values?
Investigation of TravelTime_Hr.
View(TravDistMi_Pctiles): 98% of TravelTime_Hr are between 7 seconds and 464 seconds (~8 minutes).
Investigation of TravelTime_Hr.
Histogram of TravelTime_Sec.
# Density-scaled histogram (5-second bins) of TravelTime_Sec with a red
# kernel-density line on top; the view is zoomed to 0-180 seconds.
TravTime_Sec_HistDen <- AllDays_NewTravelDist %>%
  select(TravelTime_Sec) %>%
  filter(!is.na(TravelTime_Sec)) %>%
  ggplot(aes(x = TravelTime_Sec,
             y = ..density..)) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # Disabled: per-bin count labels.
  # stat_bin(binwidth = 5,
  #          geom = "text",
  #          size = 2.5,
  #          vjust = 1.5,
  #          aes(label = format(..count.., big.mark = ","))
  # ) +
  coord_cartesian(xlim = c(0, 180), ylim = c(0, 0.02)) +
  # theme(legend.position="none") +
  labs(title = "Variation in Travel Time",
       x = "Travel Time (sec)",
       y = "Density")
TravTime_Sec_HistDen
Investigation of TravelTime_Sec.
TravelTime_Sec values are NA.
Investigation of TravelTime_Sec.
TravelTime_Sec values are extremely small.
Investigation of TravelTime_Sec.
TravelTime_Sec values are extremely large.
Investigation of TravelTime_Sec.
Are large TravelTime_Sec values related to RouteChanges? Looks likely. When the Bus involves a Route “change”, the TravelTime_Sec value is almost twice as likely to be an outlier (on the high side) as a normal value (about 47% vs. 26% of “Change” rows in the first table below).
# Row-wise proportions (margin 1): each row of the table sums to 1.
TTLargeRteChng_Spread %>%
  as.matrix() %>%
  as.table() %>%
  prop.table(1)
Normal Outlier <NA>
Change 2.583712e-01 4.669780e-01 2.746508e-01
Same 9.889061e-01 1.105373e-02 4.020451e-05
# Column-wise proportions (margin 2): each column of the table sums to 1.
TTLargeRteChng_Spread %>%
  as.matrix() %>%
  as.table() %>%
  prop.table(2)
Normal Outlier <NA>
Change 0.002224561 0.264978279 0.983135070
Same 0.997775439 0.735021721 0.016864930
Investigation of TravelTime_Sec.
Are large TravelTime_Sec values related to RouteChanges? Looks likely.
Investigation of TravelTime_Sec.
If TravelTime_Sec is below the 5th percentile for that StartStop_ID, or if TravelTime_Sec is above the 95th percentile for that StartStop_ID, consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TT_Sec_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
# Structure of the original travel time next to its replacement columns.
NewTravTime %>%
  select(TravelTime_Sec,
         TT_Sec_New,
         TT_Sec_New_Label,
         TT_Hr_New) %>%
  str()
'data.frame': 2809529 obs. of 4 variables:
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TT_Sec_New : num NA 180 37 25 190 29 288 52 76 8 ...
$ TT_Sec_New_Label: Factor w/ 4 levels "TravelTime_Sec",..: 1 1 1 1 1 1 1 1 1 1 ...
$ TT_Hr_New : num NA 0.05 0.01028 0.00694 0.05278 ...
# Summaries of the same four columns; TT_Sec_New_Label shows how many rows
# took their value from each source.
NewTravTime %>%
  select(TravelTime_Sec,
         TT_Sec_New,
         TT_Sec_New_Label,
         TT_Hr_New) %>%
  summary()
TravelTime_Sec TT_Sec_New TT_Sec_New_Label TT_Hr_New
Min. : 1.0 Min. : 1.00 TravelTime_Sec :2503794 Min. : 0.000
1st Qu.: 25.0 1st Qu.: 25.00 TT_Sec_SS_Mean : 804 1st Qu.: 0.007
Median : 39.0 Median : 39.00 TT_Sec_SS_Mean_F : 52426 Median : 0.011
Mean : 104.9 Mean : 91.31 TT_Sec_SSHG_Mean_F: 249928 Mean : 0.025
3rd Qu.: 72.0 3rd Qu.: 70.00 NA's : 2577 3rd Qu.: 0.019
Max. :60750.0 Max. :60750.00 Max. :16.875
NA's :6641 NA's :9218 NA's :9218
Test investigation of just the X2 Route. Box plots for time between bus arrivals (by HourGroup).
Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Hour Group).
# Violin plot of TimeToEvent_Min per hour group for X2_ByStop, with quartile
# lines drawn inside each violin and a text label per group taken from
# Count_Values$Value_Counts (Count_Values is built elsewhere in the notebook).
TimeBtwEvents_X2_ViolinPlot <- as.data.frame(X2_ByStop) %>%
  select(TimeToEvent_Min,
         Event_Time_HrGroup) %>%
  ggplot(aes(factor(Event_Time_HrGroup),
             TimeToEvent_Min,
             fill = factor(Event_Time_HrGroup))) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1)),
            size = 2.5,
            vjust = -0.5) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  # Zoom the y axis only (x limit left unset; previously tried c(0, 180)).
  coord_cartesian(ylim = c(0, 80)) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)")
TimeBtwEvents_X2_ViolinPlot
Test investigation of just the X2 Route. Box plots for time between bus arrivals (by Zip Code).
# Median TimeToEvent_Min per zip code; used below as the text label drawn on
# each box.
Count_Values_z <- ddply(as.data.frame(X2_ByStop),
                        .(Stop_Zip),
                        summarise,
                        Value_Counts = median(TimeToEvent_Min, na.rm = TRUE))

# Notched box plot of TimeToEvent_Min per zip code (outliers in red), with
# the median printed just above each median line.
TimeBtwEvents_X2_BoxPlot_z <- as.data.frame(X2_ByStop) %>%
  select(TimeToEvent_Min,
         Stop_Zip) %>%
  ggplot(aes(factor(Stop_Zip),
             TimeToEvent_Min,
             fill = factor(Stop_Zip))) +
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1)),
            size = 3,
            vjust = -0.5) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  # Zoom the y axis only (x limit left unset; previously tried c(0, 180)).
  coord_cartesian(ylim = c(0, 100)) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)")
TimeBtwEvents_X2_BoxPlot_z
Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).
# Violin plot of TimeToEvent_Min per zip code for X2_ByStop, with quartile
# lines and the per-zip median (from Count_Values_z) printed as text.
TimeBtwEvents_X2_ViolinPlot_z <- as.data.frame(X2_ByStop) %>%
  select(TimeToEvent_Min,
         Stop_Zip) %>%
  ggplot(aes(factor(Stop_Zip),
             TimeToEvent_Min,
             fill = factor(Stop_Zip))) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1)),
            size = 2.5,
            vjust = -0.5) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  # Zoom the y axis only (x limit left unset; previously tried c(0, 180)).
  coord_cartesian(ylim = c(0, 60)) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)")
TimeBtwEvents_X2_ViolinPlot_z
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
First, get the max and min times of bus stops (each day, and for each route).
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
(Pulls here are done by day, as the data are too large to do at once.)
# Inspect the structure of the wait-time data that was pulled one day at a time.
str(WaitData_DayPull)
'data.frame': 2666526 obs. of 23 variables:
$ RowNum_OG : int 771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
$ Route : chr "10A" "10A" "10A" "10A" ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ StopID_Clean : chr "2" "2" "2" "2" ...
$ Event_Type : int 3 4 3 3 4 3 3 4 3 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 1 3 1 1 3 1 1 3 1 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 0 1 5 5 6 6 7 8 9 10 ...
$ Event_Time_HrGroup: Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
$ Event_Time_Min : int 1 3 5 38 21 40 12 36 21 27 ...
$ Event_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ MinTime : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
$ MaxTime : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
$ SampTime : POSIXct, format: "2016-10-03 15:35:56" "2016-10-03 10:41:52" ...
$ NB : POSIXct, format: "2016-10-03 16:01:44" "2016-10-03 10:42:48" ...
$ WaitTime_Min : num 25.79 55.82 2.44 28.76 1.48 ...
$ WaitTime_Sec : num 1547.1 3349.2 146.6 1725.5 88.9 ...
$ WaitTime_Sec2 : num 25.79 55.82 2.44 28.76 1.48 ...
$ WaitTime_Min2 : num 0.4298 0.9303 0.0407 0.4793 0.0247 ...
$ WaitTime_Sec3 :Class 'difftime' atomic [1:2666526] 1547.1 55.8 8794.3 1725.5 5333.4 ...
.. ..- attr(*, "units")= chr "secs"
$ WaitTime_Min3 :Class 'difftime' atomic [1:2666526] 25.79 0.93 146.57 28.76 88.89 ...
.. ..- attr(*, "units")= chr "secs"
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
Basic investigation of any missing rows from data pulled by day.
# Unique RowNum_OG values present in the by-day pull; comparing its row count
# with WaitData_DayPull's shows whether any original row appears more than once.
DistinctRowNum_OG <- WaitData_DayPull %>%
  select(RowNum_OG) %>%
  distinct()
str(DistinctRowNum_OG)
'data.frame': 2666004 obs. of 1 variable:
$ RowNum_OG: int 771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
# View(
# anti_join(Samp,
# DistinctRowNum_OG,
# by = c("RowNum_OG" = "RowNum_OG")
# )
# )
# The samp time is AFTER the last bus passed that StopID_Clean
# View(filter(Samp,
# Event_Time > "2016-10-07 19:48:41" &
# Route == "X2" &
# StopID_Clean == 1003774
# )
# )
# Next Bus (NB) can be on the next morning
# View(filter(Testing7,
# SampTime > "2016-10-06 23:58:00" &
# SampTime < "2016-10-06 23:59:59")
# )
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)
First, we need to find the most common bus routes.
# The distinct-row check table is no longer needed.
rm(DistinctRowNum_OG)
# View(head(NewTravTime, 500))

# Fixed seed so the random route-to-group assignment is reproducible.
set.seed(123456789)

# One row per Route with its event count and share of all rows, ordered from
# busiest to least busy, then randomly assigned to one of five route groups.
BusGroups <- NewTravTime %>%
  group_by(Route) %>%
  summarise(Cnt_Num = n(),
            Cnt_Pct = Cnt_Num / nrow(NewTravTime)) %>%
  arrange(desc(Cnt_Num)) %>%
  mutate(RowNum = row_number(),
         # One uniform draw per route. n() replaces the previously hard-coded
         # 268 so this keeps working if the number of routes changes; with
         # 268 routes the draws are identical to before.
         RandNum = runif(n = n()),
         # Map the draw to groups 1-5 using right-closed fifths:
         # (.., 0.2] -> 1, (0.2, 0.4] -> 2, ..., (0.8, ..) -> 5.
         # Kept as a double to match the type produced previously.
         RouteGroup = as.numeric(findInterval(RandNum,
                                              c(0.2, 0.4, 0.6, 0.8),
                                              left.open = TRUE) + 1)
  )
str(BusGroups)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 268 obs. of 6 variables:
$ Route : chr "70" "W4" "B2" "S2" ...
$ Cnt_Num : int 48269 47672 43173 42934 41462 38968 38566 37761 37718 36524 ...
$ Cnt_Pct : num 0.0172 0.017 0.0154 0.0153 0.0148 ...
$ RowNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ RandNum : num 0.693 0.673 0.654 0.719 0.922 ...
$ RouteGroup: num 4 4 4 4 5 5 2 5 2 3 ...
# Browse the route-to-group assignment interactively, then print per-column summaries.
View(BusGroups)
summary(BusGroups)
Route Cnt_Num Cnt_Pct RowNum
Length:268 Min. : 4 Min. :1.424e-06 Min. : 1.00
Class :character 1st Qu.: 2640 1st Qu.:9.396e-04 1st Qu.: 67.75
Mode :character Median : 7358 Median :2.619e-03 Median :134.50
Mean :10483 Mean :3.731e-03 Mean :134.50
3rd Qu.:17014 3rd Qu.:6.056e-03 3rd Qu.:201.25
Max. :48269 Max. :1.718e-02 Max. :268.00
RandNum RouteGroup
Min. :0.001084 Min. :1.000
1st Qu.:0.255701 1st Qu.:2.000
Median :0.512479 Median :3.000
Mean :0.501473 Mean :3.022
3rd Qu.:0.756575 3rd Qu.:4.000
Max. :0.997351 Max. :5.000
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)
# Inspect the structure of the wait-time data that was pulled by route group.
str(WaitData_RoutePull)
'data.frame': 2780848 obs. of 22 variables:
$ RowNum_OG : int 771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
$ Route : chr "10A" "10A" "10A" "10A" ...
$ RouteGroup : num 4 4 4 4 4 4 4 4 4 4 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ StopID_Clean : chr "2" "2" "2" "2" ...
$ Event_Type : int 3 4 3 3 4 3 3 4 3 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 1 3 1 1 3 1 1 3 1 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 0 1 5 5 6 6 7 8 9 10 ...
$ Event_Time_HrGroup: Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
$ Event_Time_Min : int 1 3 5 38 21 40 12 36 21 27 ...
$ Event_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ MinTime : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
$ MaxTime : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
$ SampTime : POSIXct, format: "2016-10-03 08:00:38" "2016-10-03 02:41:02" ...
$ NB : POSIXct, format: "2016-10-03 08:36:07" "2016-10-03 05:05:41" ...
$ WaitTime_Min : num 35.47 2.41 22.98 4.94 7.99 ...
$ WaitTime_Sec : num 2128 145 1379 296 480 ...
$ WaitTime_Sec2 :Class 'difftime' atomic [1:2780848] 2128 8679 1379 296 480 ...
.. ..- attr(*, "units")= chr "secs"
$ WaitTime_Min2 :Class 'difftime' atomic [1:2780848] 35.47 144.65 22.98 4.94 7.99 ...
.. ..- attr(*, "units")= chr "secs"
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
Compare WaitData pulled by day and pulled by route.
Waiting time analyses.
Munging and sampling data to go from time between buses to “average” waiting time.
Compare WaitData (pulled by route) and original data (NewTravTime).
# Inspect the comparison of NewTravTime against the by-route wait data.
str(Compare_NTT_WD) # 2,810,109 rows overall -- 29,261 of them have no match
'data.frame': 2810109 obs. of 63 variables:
$ RowNum_OG : int 771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
$ UniqueLatLng : chr "38.867313__-77.053574" "38.867313__-77.053574" "38.867313__-77.053574" "38.867313__-77.053574" ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "6000273--2" "6000273--2" "6000273--2" "6000273--2" ...
$ BusDay_EventNum : int 2 70 55 55 55 94 164 158 272 266 ...
$ Bus_ID : int 2915 2719 2950 2634 2625 2674 2950 2625 2950 2625 ...
$ Route : chr "10A" "10A" "10A" "10A" ...
$ RteChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 6 6 6 6 6 6 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 7 7 7 7 7 7 7 7 7 7 ...
$ Stop_Sequence : int 55 55 55 55 55 55 55 55 55 55 ...
$ Start_ID : chr "6000273" "6000273" "6000273" "6000273" ...
$ Start_Desc : chr "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" ...
$ StopID_Clean : chr "2" "2" "2" "2" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Desc : chr "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 132 132 132 132 132 132 132 132 132 132 ...
$ Event_Type : int 3 4 3 3 4 3 3 4 3 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 1 3 1 1 3 1 1 3 1 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 0 1 5 5 6 6 7 8 9 10 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
$ Event_Time_Min : int 1 3 5 38 21 40 12 36 21 27 ...
$ Event_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ Departure_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ Dwell_Time : int 0 0 0 0 0 0 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Delta_Time : int -210 -89 -35 149 914 253 217 1267 400 900 ...
$ Latitude : num 38.9 38.9 38.9 38.9 38.9 ...
$ Longitude : num -77.1 -77.1 -77.1 -77.1 -77.1 ...
$ Heading : int 23 23 23 23 23 23 23 23 23 23 ...
$ Odometer_Distance : int 1131407 909311 87585 80914 88439 69784 211146 212739 336615 337781 ...
$ Odometer_Distance_Lag1 : int 1131407 908412 87585 80914 85325 69784 211146 211995 336615 337065 ...
$ Odometer_Distance_Mi : num 214.3 172.2 16.6 15.3 16.7 ...
$ TravelDistance_Ft : int NA 899 NA NA 3114 NA NA 744 NA 716 ...
$ TravelDistance_Mi : num NA 0.17 NA NA 0.59 ...
$ TravelDistance_Mi_Hvrs : num 0.322 0.322 0.322 0.322 0.319 ...
$ TravelTime_Sec : num 94 191 130 85 183 114 183 183 124 128 ...
$ TravelTime_Hr : num 0.0261 0.0531 0.0361 0.0236 0.0508 ...
$ SpeedAvg_Mph : num NA 3.21 NA NA 11.6 ...
$ TravelDistance_Mi_New : num 0.322 0.17 0.322 0.322 0.59 ...
$ TravelDistance_Mi_New_Label : Factor w/ 5 levels "TD_Mi_SS_Mean",..: 5 4 5 5 4 5 5 4 5 4 ...
$ TravelDistance_Mi_NewHvrs : num 0.322 0.17 0.322 0.322 0.59 ...
$ TravelDistance_Mi_NewHvrs_Label: Factor w/ 5 levels "TD_Mi_SS_Mean",..: 5 4 5 5 4 5 5 4 5 4 ...
$ SpeedAvg_Mph_NewHvrs : num 12.32 3.21 8.91 13.62 11.6 ...
$ TT_Sec_New : num 94 191 130 85 183 114 183 183 124 128 ...
$ TT_Sec_New_Label : Factor w/ 4 levels "TravelTime_Sec",..: 1 1 1 1 1 1 1 1 1 1 ...
$ TT_Hr_New : num 0.0261 0.0531 0.0361 0.0236 0.0508 ...
$ RouteGroup : num 4 4 4 4 4 4 4 4 4 4 ...
$ MinTime : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
$ MaxTime : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
$ SampTime : POSIXct, format: "2016-10-03 08:00:38" "2016-10-03 02:41:02" ...
$ NB : POSIXct, format: "2016-10-03 08:36:07" "2016-10-03 05:05:41" ...
$ WaitTime_Sec2 :Class 'difftime' atomic [1:2810109] 2128 8679 1379 296 480 ...
.. ..- attr(*, "units")= chr "secs"
$ WaitTime_Min2 :Class 'difftime' atomic [1:2810109] 35.47 144.65 22.98 4.94 7.99 ...
.. ..- attr(*, "units")= chr "secs"
Clean up the data a bit.
# Inspect the cleaned table; in the printed structure WaitTime_Sec2 and
# WaitTime_Min2 appear as plain numerics and a RouteStop_ID factor is present.
str(WaitTime_AsNum)
'data.frame': 2810109 obs. of 64 variables:
$ RowNum_OG : int 771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
$ UniqueLatLng : chr "38.867313__-77.053574" "38.867313__-77.053574" "38.867313__-77.053574" "38.867313__-77.053574" ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "6000273--2" "6000273--2" "6000273--2" "6000273--2" ...
$ BusDay_EventNum : int 2 70 55 55 55 94 164 158 272 266 ...
$ Bus_ID : int 2915 2719 2950 2634 2625 2674 2950 2625 2950 2625 ...
$ Route : chr "10A" "10A" "10A" "10A" ...
$ RteChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 6 6 6 6 6 6 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 7 7 7 7 7 7 7 7 7 7 ...
$ Stop_Sequence : int 55 55 55 55 55 55 55 55 55 55 ...
$ Start_ID : chr "6000273" "6000273" "6000273" "6000273" ...
$ Start_Desc : chr "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" ...
$ StopID_Clean : chr "2" "2" "2" "2" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_Desc : chr "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" ...
$ countryCode : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
$ Stop_State : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
$ Stop_County : Factor w/ 11 levels "Anne Arundel",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_City : Factor w/ 56 levels "Accokeek","Alexandria",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Stop_Zip : Factor w/ 153 levels "20001","20002",..: 132 132 132 132 132 132 132 132 132 132 ...
$ Event_Type : int 3 4 3 3 4 3 3 4 3 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 1 3 1 1 3 1 1 3 1 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 0 1 5 5 6 6 7 8 9 10 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
$ Event_Time_Min : int 1 3 5 38 21 40 12 36 21 27 ...
$ Event_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ Departure_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ Dwell_Time : int 0 0 0 0 0 0 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Delta_Time : int -210 -89 -35 149 914 253 217 1267 400 900 ...
$ Latitude : num 38.9 38.9 38.9 38.9 38.9 ...
$ Longitude : num -77.1 -77.1 -77.1 -77.1 -77.1 ...
$ Heading : int 23 23 23 23 23 23 23 23 23 23 ...
$ Odometer_Distance : int 1131407 909311 87585 80914 88439 69784 211146 212739 336615 337781 ...
$ Odometer_Distance_Lag1 : int 1131407 908412 87585 80914 85325 69784 211146 211995 336615 337065 ...
$ Odometer_Distance_Mi : num 214.3 172.2 16.6 15.3 16.7 ...
$ TravelDistance_Ft : int NA 899 NA NA 3114 NA NA 744 NA 716 ...
$ TravelDistance_Mi : num NA 0.17 NA NA 0.59 ...
$ TravelDistance_Mi_Hvrs : num 0.322 0.322 0.322 0.322 0.319 ...
$ TravelTime_Sec : num 94 191 130 85 183 114 183 183 124 128 ...
$ TravelTime_Hr : num 0.0261 0.0531 0.0361 0.0236 0.0508 ...
$ SpeedAvg_Mph : num NA 3.21 NA NA 11.6 ...
$ TravelDistance_Mi_New : num 0.322 0.17 0.322 0.322 0.59 ...
$ TravelDistance_Mi_New_Label : Factor w/ 5 levels "TD_Mi_SS_Mean",..: 5 4 5 5 4 5 5 4 5 4 ...
$ TravelDistance_Mi_NewHvrs : num 0.322 0.17 0.322 0.322 0.59 ...
$ TravelDistance_Mi_NewHvrs_Label: Factor w/ 5 levels "TD_Mi_SS_Mean",..: 5 4 5 5 4 5 5 4 5 4 ...
$ SpeedAvg_Mph_NewHvrs : num 12.32 3.21 8.91 13.62 11.6 ...
$ TT_Sec_New : num 94 191 130 85 183 114 183 183 124 128 ...
$ TT_Sec_New_Label : Factor w/ 4 levels "TravelTime_Sec",..: 1 1 1 1 1 1 1 1 1 1 ...
$ TT_Hr_New : num 0.0261 0.0531 0.0361 0.0236 0.0508 ...
$ RouteGroup : num 4 4 4 4 4 4 4 4 4 4 ...
$ MinTime : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
$ MaxTime : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
$ SampTime : POSIXct, format: "2016-10-03 08:00:38" "2016-10-03 02:41:02" ...
$ NB : POSIXct, format: "2016-10-03 08:36:07" "2016-10-03 05:05:41" ...
$ WaitTime_Sec2 : num 2128 8679 1379 296 480 ...
$ WaitTime_Min2 : num 35.47 144.65 22.98 4.94 7.99 ...
$ RouteStop_ID : Factor w/ 20897 levels "10A__2","10A__3",..: 1 1 1 1 1 1 1 1 1 1 ...
General exploration of wait times.
# Quartiles, mean and NA count of the wait time (presumably in minutes, per the column name).
summary(WaitTime_AsNum$WaitTime_Min2)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 7.863 17.550 73.270 39.390 5154.000 29261
General exploration of wait times.
Looks like there might be an issue in wait times when very few Route-Stop combinations are included in the dataset. Let’s explore these.
Histogram of the counts of Route-StopID combinations.
# Bar chart of the pre-aggregated table RouteStop_CntOfCnt: for each
# occurrence count of a Route-StopID combination (x), how many combinations
# have that count (y). The view is zoomed to occurrence counts 0-500.
RouteStop_Cnts_Bar <- ggplot(RouteStop_CntOfCnt,
                             aes(x = RouteStop_CntNum,
                                 # y = ..density..
                                 y = RouteStopCnt_CntNum)) +
  # The data are already counted, so geom_col is used instead of:
  # geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  coord_cartesian(xlim = c(0, 500)
                  # ylim = c(0, 0.02)
  ) +
  # Axis label spelling corrected ("Combinations").
  labs(title = "Variation in Routes Passing a Specific Stop",
       x = "Occurrences of Route-StopID Combinations",
       y = "Counts")
RouteStop_Cnts_Bar
Create a new dataset limiting extremely small counts of Route-StopID combinations.
# Baseline: wait-time summary over every row.
WaitTime_RteCnts %>%
  select(WaitTime_Min2) %>%
  summary()
WaitTime_Min2
Min. : 0.000
1st Qu.: 7.863
Median : 17.554
Mean : 73.268
3rd Qu.: 39.394
Max. :5154.346
NA's :29261
# Wait-time summary keeping only Route-Stop combinations seen more than 60
# times (i.e. 12 passes per day in a 5-day dataset).
WaitTime_RteCnts %>%
  filter(RouteStop_CntNum > 60) %>%
  select(WaitTime_Min2) %>%
  summary()
WaitTime_Min2
Min. : 0.000
1st Qu.: 7.477
Median : 16.512
Mean : 49.070
3rd Qu.: 33.572
Max. :2478.582
NA's :16298
# Wait-time summary keeping only waits under 180; a longer wait probably
# means that something went wrong.
WaitTime_RteCnts %>%
  filter(WaitTime_Min2 < 180) %>%
  select(WaitTime_Min2) %>%
  summary()
WaitTime_Min2
Min. : 0.000
1st Qu.: 6.969
Median : 15.156
Mean : 25.022
3rd Qu.: 28.187
Max. :180.000
Compare quantiles in the limited datasets.
# Side-by-side percentiles (0%, 1%, ..., 100%) of WaitTime_Min2 under three
# row selections:
#   All         - every row
#   RteStpAbv60 - Route-Stop combinations seen more than 60 times
#   WTBlw180    - waits below 180
# quantile() is given the numeric column itself (not a one-column data
# frame), and no temporaries named a/b/c are created -- in particular nothing
# shadows base::c().
WT_Filter_Quantiles <- data.frame(
  All = quantile(WaitTime_RteCnts[["WaitTime_Min2"]],
                 probs = seq(0, 1, 0.01),
                 na.rm = TRUE),
  RteStpAbv60 = quantile(filter(WaitTime_RteCnts,
                                RouteStop_CntNum > 60)[["WaitTime_Min2"]],
                         probs = seq(0, 1, 0.01),
                         na.rm = TRUE),
  WTBlw180 = quantile(filter(WaitTime_RteCnts,
                             WaitTime_Min2 < 180)[["WaitTime_Min2"]],
                      probs = seq(0, 1, 0.01),
                      na.rm = TRUE),
  Quantile = seq(0, 1, 0.01),
  # Drop the "0%", "1%", ... names so rows are simply numbered; the
  # Quantile column already carries that information.
  row.names = NULL
)
View(WT_Filter_Quantiles)
WT_Filter_Quantiles
Histogram of all wait times.
# Density-scaled histogram (bin width 5) of WaitTime_Min2 with a red
# kernel-density line; the view is zoomed to waits of 0-300.
WaitTime_AllBus_HistDen <- WaitTime_RteCnts %>%
  select(WaitTime_Min2) %>%
  filter(!is.na(WaitTime_Min2)) %>%
  ggplot(aes(x = WaitTime_Min2,
             y = ..density..)) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  scale_x_continuous(breaks = seq(0, 300, 30)) +
  coord_cartesian(xlim = c(0, 300),
                  ylim = c(0, 0.035)) +
  labs(title = "Variation in Wait Time",
       x = "Wait Time (min)",
       y = "Density")
WaitTime_AllBus_HistDen
Box plots for WaitTime (all busses, by Zip Code).
# Wait-time rows for route X2 only.
BusRoute <- WaitTime_RteCnts %>%
  select(Route,
         WaitTime_Min2,
         Stop_Zip) %>%
  filter(Route == "X2")

# Median wait per zip code; used below as the text label drawn on each box.
CountValues_AllBus_Zip <- ddply(BusRoute,
                                .(Stop_Zip),
                                summarise,
                                Value_Counts = median(WaitTime_Min2, na.rm = TRUE))

# Notched box plot of the X2 wait time per zip code (outliers in red), with
# the median printed just above each median line.
WaitTime_AllBus_Zip_Box <- ggplot(BusRoute,
                                  aes(factor(Stop_Zip),
                                      WaitTime_Min2,
                                      fill = factor(Stop_Zip))) +
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1)),
            size = 3,
            vjust = -0.5) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  # Zoom the y axis only (x limit left unset; previously tried c(0, 180)).
  coord_cartesian(ylim = c(0, 45)) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)")
WaitTime_AllBus_Zip_Box
Violin plots for WaitTime (X2 Route only, by Zip Code).
# Violin plot of the X2 wait time per zip code, with quartile lines inside
# each violin and the per-zip median (from CountValues_AllBus_Zip) as text.
WaitTime_AllBus_Zip_Violin <- ggplot(BusRoute,
                                     aes(factor(Stop_Zip),
                                         WaitTime_Min2,
                                         fill = factor(Stop_Zip))) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1)),
            size = 3.5,
            vjust = -0.5) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  # Zoom the y axis only (x limit left unset; previously tried c(0, 180)).
  coord_cartesian(ylim = c(0, 45)) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)")
# Print the plot built above. The previous code printed
# TimeBtwEvents_X2_ViolinPlot_z (the earlier time-between-buses plot), so
# this wait-time violin plot was never displayed.
WaitTime_AllBus_Zip_Violin
Box plots for WaitTime (Zip Code, by HourGroupZip).
# Wait-time rows for stops in zip code 20002 only.
Zip <- WaitTime_RteCnts %>%
  select(Route,
         WaitTime_Min2,
         Stop_Zip,
         Event_Time_HrGroup) %>%
  filter(Stop_Zip == 20002)

# Median wait per hour group; used below as the text label drawn on each box.
CountValues_AllBus_HG <- ddply(Zip,
                               .(Event_Time_HrGroup),
                               summarise,
                               Value_Counts = median(WaitTime_Min2,
                                                     na.rm = TRUE))

# Notched box plot of the wait time per hour group (outliers in red), with
# the median printed just above each median line.
WaitTime_AllBus_HG_Box <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup))) +
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1)),
            size = 2.5,
            vjust = -0.5) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  # Zoom the y axis only (x limit left unset; previously tried c(0, 180)).
  coord_cartesian(ylim = c(0, 45)) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)")
# Disabled: one panel per zip code.
# facet_wrap(~Stop_Zip
#            # nrow = 5
# )
WaitTime_AllBus_HG_Box
Violin plots for WaitTime (Zip Code, by HourGroupZip).
# Violin counterpart of the hour-group box plot for the `Zip` subset.
# Quartile lines are drawn inside each violin, violin areas are scaled by
# observation count, and the labels come from CountValues_AllBus_HG.
WaitTime_AllBus_HG_Vln <-
  ggplot(
    Zip,
    aes(
      x = factor(Event_Time_HrGroup),
      y = WaitTime_Min2,
      fill = factor(Event_Time_HrGroup)
    )
  ) +
  geom_violin(
    draw_quantiles = c(0.25, 0.5, 0.75),
    trim = TRUE,
    scale = "count",
    na.rm = TRUE,
    show.legend = NA,
    inherit.aes = TRUE
  ) +
  geom_text(
    data = CountValues_AllBus_HG,
    aes(
      y = Value_Counts,
      label = format(round(Value_Counts, digits = 1), nsmall = 1)
    ),
    size = 2.5,
    vjust = -0.5
  ) +
  labs(
    title = "Waiting Time at a Given Stop (for Zip 20002)",
    x = "Hour Group",
    y = "Waiting Time (min)"
  ) +
  # Zoom to 0-90 minutes; an xlim of c(0, 180) was tried and left disabled.
  coord_cartesian(ylim = c(0, 90)) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45))
# Optional (disabled): facet_wrap(~Stop_Zip) to split the plot by zip code.

WaitTime_AllBus_HG_Vln
Box plots for WaitTime (Route, by HourGroupZip).
# Restrict the wait-time table to route X2, keeping only the columns used
# below.
Rte <- WaitTime_RteCnts %>%
  select(Route, WaitTime_Min2, Stop_Zip, Event_Time_HrGroup) %>%
  filter(Route == "X2")

# Per hour group: the median (drawn as a text label on each box) and the 90th
# percentile (used to choose the upper y-axis limit).
CountValues_AllBus_RteHG <- Rte %>%
  group_by(Event_Time_HrGroup) %>%
  summarise(
    Value_Counts = median(WaitTime_Min2, na.rm = TRUE),
    VC = quantile(WaitTime_Min2, probs = 0.9, na.rm = TRUE)
  )

# Notched box plot of wait time by hour group for route X2.
WaitTime_AllBus_RteHG_Box <-
  ggplot(
    Rte,
    aes(
      x = factor(Event_Time_HrGroup),
      y = WaitTime_Min2,
      fill = factor(Event_Time_HrGroup)
    )
  ) +
  geom_boxplot(outlier.colour = "red", notch = TRUE, na.rm = TRUE) +
  geom_text(
    data = CountValues_AllBus_RteHG,
    aes(
      y = Value_Counts,
      label = format(round(Value_Counts, digits = 1), nsmall = 1)
    ),
    size = 2.5,
    vjust = -0.5
  ) +
  labs(
    title = "Waiting Time at a Given Stop",
    subtitle = "Route X2",
    x = "Hour Group",
    y = "Waiting Time (min)"
  ) +
  # Zoom so the axis tops out at the largest per-group 90th percentile; an
  # xlim of c(0, 180) was tried and left disabled.
  coord_cartesian(ylim = c(0, max(CountValues_AllBus_RteHG[["VC"]]))) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45))
# Optional (disabled): + facet_wrap(~Stop_Zip) to split the plot by zip code.

WaitTime_AllBus_RteHG_Box
Violin plots for WaitTime (Route, by HourGroupZip), faceted by Zip Code.
# Median wait per (zip code, hour group) for the route subset `Rte`.
# The plot below is faceted by Stop_Zip. A label data set without a Stop_Zip
# column is repeated unchanged in every panel, so the route-wide medians used
# previously did not match the violin drawn in each panel. Computing the
# medians per zip code makes each panel's labels agree with its own data.
CountValues_AllBus_RteHG_Zip <- Rte %>%
  group_by(Stop_Zip, Event_Time_HrGroup) %>%
  summarise(Value_Counts = median(WaitTime_Min2, na.rm = TRUE)) %>%
  ungroup()

# Violin plot of wait time by hour group for route X2, one panel per stop zip
# code. Quartile lines are drawn inside each violin and violin areas are scaled
# by observation count.
WaitTime_AllBus_RteHG_Vln <- ggplot(
  Rte,
  aes(factor(Event_Time_HrGroup),
      WaitTime_Min2,
      fill = factor(Event_Time_HrGroup)
  )
) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
  ) +
  geom_text(data = CountValues_AllBus_RteHG_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                )
            ),
            size = 2.5,
            vjust = -0.5,
            # Groups whose wait times are all missing have an NA median;
            # drop those labels without a warning.
            na.rm = TRUE
  ) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45)) +
  # Zoom only; observations above 45 minutes still contribute to the densities.
  coord_cartesian(# xlim = c(0, 180),
    ylim = c(0, 45)
  ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("(Route X2)"),
       x = "Hour Group",
       y = "Waiting Time (min)"
  ) +
  facet_wrap(~Stop_Zip
             # nrow = 5
  )
WaitTime_AllBus_RteHG_Vln
X2 Percentiles Line Graph Test.
GET DATA READY FOR SHINY – GET DATA READY FOR SHINY – GET DATA READY FOR SHINY GET DATA READY FOR SHINY – GET DATA READY FOR SHINY – GET DATA READY FOR SHINY GET DATA READY FOR SHINY – GET DATA READY FOR SHINY – GET DATA READY FOR SHINY
BaseData: Used in plots by hour and zipcode (first two Shiny tabs).
# str(WaitTime_RteCnts)
# Base table for the Shiny dashboard: pick the needed columns (renaming them to
# the shorter dashboard names as they are selected), add the event time floored
# to the hour, and keep only waits of at most 180 minutes.
Shiny_WaitData_Base <- WaitTime_RteCnts %>%
  select(
    Route,
    ZipCode = Stop_Zip,
    Event_Time,
    Date = Event_Time_Date,
    Day = Event_Time_Day,
    HourGroup = Event_Time_HrGroup,
    Hour = Event_Time_Hr,
    Latitude,
    Longitude,
    WaitTime_Min = WaitTime_Min2
  ) %>%
  mutate(Event_Time_YrMthDayHr = floor_date(Event_Time, "hour")) %>%
  filter(WaitTime_Min <= 180) %>%
  # Convert Route to a factor only after filtering, so its levels are limited
  # to the routes that remain.
  mutate(Route = factor(Route))

str(Shiny_WaitData_Base)
'data.frame': 2482606 obs. of 11 variables:
$ Route : Factor w/ 268 levels "10A","10B","10E",..: 1 1 1 1 1 1 1 1 1 1 ...
$ ZipCode : Factor w/ 153 levels "20001","20002",..: 132 132 132 132 132 132 132 132 132 132 ...
$ Event_Time : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
$ Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ HourGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
$ Hour : int 0 1 5 5 6 6 7 8 9 10 ...
$ Latitude : num 38.9 38.9 38.9 38.9 38.9 ...
$ Longitude : num -77.1 -77.1 -77.1 -77.1 -77.1 ...
$ WaitTime_Min : num 35.47 144.65 22.98 4.94 7.99 ...
$ Event_Time_YrMthDayHr: POSIXct, format: "2016-10-03 00:00:00" "2016-10-03 01:00:00" ...
# Eyeball the last 500 rows, then serialise the base table for the Shiny app.
# NOTE(review): the file name has no ".rds" extension, unlike the map data
# files saved later in this notebook -- confirm which name the app reads
# before changing it.
View(tail(Shiny_WaitData_Base, 500))
saveRDS(object = Shiny_WaitData_Base, file = "Shiny_WaitData_Base")
Prep data for mapping.
# Hourly 80th-percentile wait time per zip code, plus the day, the hour of day
# and an ordered 10-minute band for that percentile.
ZipWaitTest <- Shiny_WaitData_Base %>%
  filter(WaitTime_Min <= 180, !is.na(ZipCode)) %>%
  group_by(ZipCode, Event_Time_YrMthDayHr) %>%
  summarise(Pct80 = quantile(WaitTime_Min, probs = 0.8, na.rm = TRUE)) %>%
  arrange(ZipCode, Event_Time_YrMthDayHr) %>%
  as.data.frame() %>%
  mutate(
    Event_Time_DateNew = floor_date(Event_Time_YrMthDayHr, "day"),
    Event_Time_HrNew = hour(Event_Time_YrMthDayHr),
    # Left-closed bands [lower, upper): a value exactly on a boundary falls in
    # the higher band, matching a chain of strict "<" comparisons.
    Pct80_Level = cut(
      Pct80,
      breaks = c(-Inf, 10, 20, 30, 40, 50, 60, Inf),
      labels = c(
        "Below 10", "Below 20", "Below 30",
        "Below 40", "Below 50", "Below 60", "60 Plus"
      ),
      right = FALSE,
      ordered_result = TRUE
    )
  )

str(ZipWaitTest)
'data.frame': 14666 obs. of 6 variables:
$ ZipCode : Factor w/ 153 levels "20001","20002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Event_Time_YrMthDayHr: POSIXct, format: "2016-10-03 00:00:00" "2016-10-03 01:00:00" ...
$ Pct80 : num 26.9 20.5 22.8 25.5 29.3 ...
$ Event_Time_DateNew : POSIXct, format: "2016-10-03" "2016-10-03" ...
$ Event_Time_HrNew : int 0 1 2 4 5 6 7 8 9 10 ...
$ Pct80_Level : Ord.factor w/ 7 levels "Below 10"<"Below 20"<..: 3 3 3 3 3 3 3 3 3 3 ...
# Store ZipCode as plain character so it can be matched against the character
# key used in the join below.
ZipWaitTest <- mutate(ZipWaitTest, ZipCode = as.character(ZipCode))
str(ZipWaitTest)
'data.frame': 14666 obs. of 6 variables:
$ ZipCode : chr "20001" "20001" "20001" "20001" ...
$ Event_Time_YrMthDayHr: POSIXct, format: "2016-10-03 00:00:00" "2016-10-03 01:00:00" ...
$ Pct80 : num 26.9 20.5 22.8 25.5 29.3 ...
$ Event_Time_DateNew : POSIXct, format: "2016-10-03" "2016-10-03" ...
$ Event_Time_HrNew : int 0 1 2 4 5 6 7 8 9 10 ...
$ Pct80_Level : Ord.factor w/ 7 levels "Below 10"<"Below 20"<..: 3 3 3 3 3 3 3 3 3 3 ...
# Column-wise summary of the hourly per-zip table before it is joined to the
# polygon data.
summary(ZipWaitTest)
ZipCode Event_Time_YrMthDayHr Pct80
Length:14666 Min. :2016-10-03 00:00:00 Min. : 0.1644
Class :character 1st Qu.:2016-10-04 08:00:00 1st Qu.: 26.9316
Mode :character Median :2016-10-05 13:00:00 Median : 32.6174
Mean :2016-10-05 12:42:39 Mean : 38.5860
3rd Qu.:2016-10-06 18:00:00 3rd Qu.: 43.3970
Max. :2016-10-07 23:00:00 Max. :177.7933
Event_Time_DateNew Event_Time_HrNew Pct80_Level
Min. :2016-10-03 00:00:00 Min. : 0.00 Below 10: 216
1st Qu.:2016-10-04 00:00:00 1st Qu.: 7.00 Below 20: 973
Median :2016-10-05 00:00:00 Median :13.00 Below 30:4587
Mean :2016-10-05 00:06:58 Mean :12.59 Below 40:4346
3rd Qu.:2016-10-06 00:00:00 3rd Qu.:18.00 Below 50:2085
Max. :2016-10-07 00:00:00 Max. :23.00 Below 60: 938
60 Plus :1521
# Eyeball the first 500 rows, then attach the `ggtract` rows to every
# (zip, hour) record by matching ZipCode against ggtract's `id`. Each input row
# is repeated once per matching ggtract row, so the result is far larger than
# ZipWaitTest.
View(head(ZipWaitTest, 500))
StopZip_Left <- ZipWaitTest %>%
  left_join(ggtract, by = c(ZipCode = "id"))
str(StopZip_Left)
'data.frame': 13326084 obs. of 12 variables:
$ ZipCode : chr "20001" "20001" "20001" "20001" ...
$ Event_Time_YrMthDayHr: POSIXct, format: "2016-10-03 00:00:00" "2016-10-03 00:00:00" ...
$ Pct80 : num 26.9 26.9 26.9 26.9 26.9 ...
$ Event_Time_DateNew : POSIXct, format: "2016-10-03" "2016-10-03" ...
$ Event_Time_HrNew : int 0 0 0 0 0 0 0 0 0 0 ...
$ Pct80_Level : Ord.factor w/ 7 levels "Below 10"<"Below 20"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ long : num -77 -77 -77 -77 -77 ...
$ lat : num 38.9 38.9 38.9 38.9 38.9 ...
$ order : int 6215994 6215995 6215996 6215997 6215998 6215999 6216000 6216001 6216002 6216003 ...
$ hole : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ piece : Factor w/ 42 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ group : Factor w/ 43395 levels "00601.1","00602.1",..: 8990 8990 8990 8990 8990 8990 8990 8990 8990 8990 ...
# Column-wise summary of the joined table (wait-time bands plus the ggtract
# columns).
summary(StopZip_Left)
ZipCode Event_Time_YrMthDayHr Pct80
Length:13326084 Min. :2016-10-03 00:00:00 Min. : 0.1644
Class :character 1st Qu.:2016-10-04 08:00:00 1st Qu.: 27.2793
Mode :character Median :2016-10-05 13:00:00 Median : 33.5260
Mean :2016-10-05 12:50:44 Mean : 39.6346
3rd Qu.:2016-10-06 18:00:00 3rd Qu.: 45.2414
Max. :2016-10-07 23:00:00 Max. :177.7933
Event_Time_DateNew Event_Time_HrNew Pct80_Level long
Min. :2016-10-03 00:00:00 Min. : 0.00 Below 10: 153887 Min. :-77.55
1st Qu.:2016-10-04 00:00:00 1st Qu.: 8.00 Below 20: 775558 1st Qu.:-77.16
Median :2016-10-05 00:00:00 Median :13.00 Below 30:4064708 Median :-77.06
Mean :2016-10-05 00:10:08 Mean :12.68 Below 40:3803555 Mean :-77.08
3rd Qu.:2016-10-06 00:00:00 3rd Qu.:18.00 Below 50:2073888 3rd Qu.:-76.99
Max. :2016-10-07 00:00:00 Max. :23.00 Below 60:1005385 Max. :-76.64
60 Plus :1449103
lat order hole piece
Min. :38.49 Min. :6215994 Mode :logical 1 :12835808
1st Qu.:38.84 1st Qu.:6352480 FALSE:13131418 2 : 298127
Median :38.91 Median :6464805 TRUE :194666 3 : 169063
Mean :38.91 Mean :6564079 NA's :0 4 : 8806
3rd Qu.:38.97 3rd Qu.:6907659 5 : 6426
Max. :39.23 Max. :6956170 6 : 3332
(Other): 4522
group
20744.1: 368712
22202.1: 343044
20166.1: 327275
20772.1: 301665
20854.1: 285854
20015.1: 266988
(Other):11432546
Test mapping functionality.
Shiny data for mapping (used in 3rd tab).
# Spot-check the joined rows for hour 15 (first 500 only).
View(head(filter(StopZip_Left, Event_Time_HrNew == 15), 500))

# Map table for the Shiny app: same rows as StopZip_Left, with the three time
# columns given shorter names.
Shiny_WaitData_Map <- rename(
  StopZip_Left,
  YrMthDayHr = Event_Time_YrMthDayHr,
  YrMthDay = Event_Time_DateNew,
  Hour = Event_Time_HrNew
)
str(Shiny_WaitData_Map)
'data.frame': 13326084 obs. of 12 variables:
$ ZipCode : chr "20001" "20001" "20001" "20001" ...
$ YrMthDayHr : POSIXct, format: "2016-10-03 00:00:00" "2016-10-03 00:00:00" ...
$ Pct80 : num 26.9 26.9 26.9 26.9 26.9 ...
$ YrMthDay : POSIXct, format: "2016-10-03" "2016-10-03" ...
$ Hour : int 0 0 0 0 0 0 0 0 0 0 ...
$ Pct80_Level: Ord.factor w/ 7 levels "Below 10"<"Below 20"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ long : num -77 -77 -77 -77 -77 ...
$ lat : num 38.9 38.9 38.9 38.9 38.9 ...
$ order : int 6215994 6215995 6215996 6215997 6215998 6215999 6216000 6216001 6216002 6216003 ...
$ hole : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ piece : Factor w/ 42 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ group : Factor w/ 43395 levels "00601.1","00602.1",..: 8990 8990 8990 8990 8990 8990 8990 8990 8990 8990 ...
# Subset of the map table for a single day, 2016-10-05.
# The reference timestamp is built in the same time zone as YrMthDay.
# Without `tz =`, as.POSIXct() uses the session time zone, so the equality test
# would silently match no rows whenever the column carries a different zone.
Shiny_WaitData_Map_Wed <- filter(
  Shiny_WaitData_Map,
  YrMthDay == as.POSIXct("2016-10-05", tz = tz(YrMthDay))
)
str(Shiny_WaitData_Map_Wed)
'data.frame': 2670985 obs. of 12 variables:
$ ZipCode : chr "20001" "20001" "20001" "20001" ...
$ YrMthDayHr : POSIXct, format: "2016-10-05 00:00:00" "2016-10-05 00:00:00" ...
$ Pct80 : num 24.4 24.4 24.4 24.4 24.4 ...
$ YrMthDay : POSIXct, format: "2016-10-05" "2016-10-05" ...
$ Hour : int 0 0 0 0 0 0 0 0 0 0 ...
$ Pct80_Level: Ord.factor w/ 7 levels "Below 10"<"Below 20"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ long : num -77 -77 -77 -77 -77 ...
$ lat : num 38.9 38.9 38.9 38.9 38.9 ...
$ order : int 6215994 6215995 6215996 6215997 6215998 6215999 6216000 6216001 6216002 6216003 ...
$ hole : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ piece : Factor w/ 42 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ group : Factor w/ 43395 levels "00601.1","00602.1",..: 8990 8990 8990 8990 8990 8990 8990 8990 8990 8990 ...
# Column-wise summary of the single-day subset, to confirm that only
# 2016-10-05 remains.
summary(Shiny_WaitData_Map_Wed)
ZipCode YrMthDayHr Pct80
Length:2670985 Min. :2016-10-05 00:00:00 Min. : 0.7088
Class :character 1st Qu.:2016-10-05 07:00:00 1st Qu.: 26.8931
Mode :character Median :2016-10-05 13:00:00 Median : 33.2470
Mean :2016-10-05 12:38:31 Mean : 38.7452
3rd Qu.:2016-10-05 18:00:00 3rd Qu.: 44.2554
Max. :2016-10-05 23:00:00 Max. :177.7933
YrMthDay Hour Pct80_Level long
Min. :2016-10-05 Min. : 0.00 Below 10: 26124 Min. :-77.55
1st Qu.:2016-10-05 1st Qu.: 7.00 Below 20:160309 1st Qu.:-77.16
Median :2016-10-05 Median :13.00 Below 30:868284 Median :-77.06
Mean :2016-10-05 Mean :12.64 Below 40:732820 Mean :-77.07
3rd Qu.:2016-10-05 3rd Qu.:18.00 Below 50:438313 3rd Qu.:-76.98
Max. :2016-10-05 Max. :23.00 Below 60:185217 Max. :-76.64
60 Plus :259918
lat order hole piece
Min. :38.49 Min. :6215994 Mode :logical 1 :2573195
1st Qu.:38.84 1st Qu.:6352542 FALSE:2632503 2 : 59324
Median :38.91 Median :6464799 TRUE :38482 3 : 33855
Mean :38.91 Mean :6564467 NA's :0 4 : 1731
3rd Qu.:38.97 3rd Qu.:6907853 5 : 1296
Max. :39.23 Max. :6956170 6 : 672
(Other): 912
group
20744.1: 75108
22202.1: 70368
20166.1: 65455
20772.1: 62985
20854.1: 54738
20015.1: 53866
(Other):2288465
# Serialise the full map table and its single-day subset for the Shiny app.
# Both files are written relative to the current working directory.
saveRDS(object = Shiny_WaitData_Map, file = "Shiny_WaitData_Map.rds")
saveRDS(object = Shiny_WaitData_Map_Wed, file = "Shiny_WaitData_Map_Wed.rds")
Clustering
Data prep.
# Column-wise summary of the per-route statistics used as clustering input.
summary(RouteStats)
Route BusDayEventNum_Mean BusDayEventNum_Pct10 BusDayEventNum_Pct25
Length:268 Min. : 11.84 Min. : 2.00 Min. : 7.00
Class :character 1st Qu.:170.61 1st Qu.: 20.38 1st Qu.: 51.75
Mode :character Median :251.39 Median : 45.00 Median :106.12
Mean :245.25 Mean : 47.47 Mean :104.84
3rd Qu.:315.43 3rd Qu.: 63.25 3rd Qu.:145.00
Max. :524.23 Max. :410.20 Max. :444.50
BusDayEventNum_Pct50 BusDayEventNum_Pct75 BusDayEventNum_Pct90 StopSequence_Mean
Min. : 12.0 Min. : 17.0 Min. : 19.0 Min. : 1.948
1st Qu.:133.9 1st Qu.:252.5 1st Qu.:395.2 1st Qu.:13.827
Median :221.5 Median :375.9 Median :503.0 Median :20.877
Mean :216.2 Mean :359.0 Mean :489.3 Mean :21.494
3rd Qu.:290.5 3rd Qu.:460.2 3rd Qu.:613.8 3rd Qu.:28.037
Max. :653.0 Max. :761.2 Max. :934.0 Max. :49.067
StopSequence_Pct10 StopSequence_Pct25 StopSequence_Pct50 StopSequence_Pct75
Min. : 1.000 Min. : 1.00 Min. : 2.00 Min. : 2.00
1st Qu.: 4.000 1st Qu.: 7.00 1st Qu.:14.00 1st Qu.:20.00
Median : 5.000 Median :11.00 Median :21.00 Median :31.00
Mean : 5.374 Mean :11.41 Mean :21.34 Mean :31.49
3rd Qu.: 7.000 3rd Qu.:14.25 3rd Qu.:28.00 3rd Qu.:41.00
Max. :21.000 Max. :32.00 Max. :49.00 Max. :73.00
StopSequence_Pct90 EventTimeHr_Mean EventTimeHr_Pct10 EventTimeHr_Pct25
Min. : 2.00 Min. : 5.403 Min. : 0.00 Min. : 0.00
1st Qu.:23.60 1st Qu.:12.506 1st Qu.: 6.00 1st Qu.: 8.00
Median :37.00 Median :13.111 Median : 6.00 Median : 8.00
Mean :37.81 Mean :13.153 Mean : 6.54 Mean : 8.63
3rd Qu.:49.00 3rd Qu.:13.785 3rd Qu.: 7.00 3rd Qu.: 9.00
Max. :90.00 Max. :21.311 Max. :20.00 Max. :21.00
EventTimeHr_Pct50 EventTimeHr_Pct75 EventTimeHr_Pct90 DwellTime2_Mean
Min. : 1.00 Min. : 6.00 Min. : 6.00 Min. : 1.108
1st Qu.:13.00 1st Qu.:17.00 1st Qu.:18.00 1st Qu.: 3.815
Median :14.00 Median :18.00 Median :19.00 Median : 5.698
Mean :13.93 Mean :17.31 Mean :19.08 Mean : 7.757
3rd Qu.:16.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.: 8.024
Max. :22.00 Max. :23.00 Max. :23.00 Max. :89.000
DwellTime2_Pct10 DwellTime2_Pct25 DwellTime2_Pct50 DwellTime2_Pct75
Min. : 0.00000 Min. : 0.000 Min. : 0.0000 Min. : 0.000
1st Qu.: 0.00000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 1.000
Median : 0.00000 Median : 0.000 Median : 0.0000 Median : 3.000
Mean : 0.06642 Mean : 0.166 Mean : 0.7575 Mean : 4.824
3rd Qu.: 0.00000 3rd Qu.: 0.000 3rd Qu.: 0.0000 3rd Qu.: 6.000
Max. :17.80000 Max. :44.500 Max. :89.0000 Max. :133.500
DwellTime2_Pct90 TravDistMi_Mean TravDistMi_Pct10 TravDistMi_Pct25
Min. : 0.00 Min. :0.1508 Min. :0.03871 Min. :0.08197
1st Qu.: 7.00 1st Qu.:0.2281 1st Qu.:0.09640 1st Qu.:0.13414
Median : 11.00 Median :0.2652 Median :0.10114 Median :0.14630
Mean : 16.02 Mean :0.3816 Mean :0.12799 Mean :0.17532
3rd Qu.: 16.00 3rd Qu.:0.3800 3rd Qu.:0.11123 3rd Qu.:0.15909
Max. :372.00 Max. :3.4100 Max. :3.40238 Max. :3.40525
TravDistMi_Pct50 TravDistMi_Pct75 TravDistMi_Pct90 TravTimSec_Mean
Min. :0.1367 Min. :0.1680 Min. : 0.2118 Min. : 46.20
1st Qu.:0.1898 1st Qu.:0.2579 1st Qu.: 0.3378 1st Qu.: 72.68
Median :0.2068 Median :0.2821 Median : 0.3951 Median : 92.65
Mean :0.2576 Mean :0.3780 Mean : 0.6888 Mean : 168.49
3rd Qu.:0.2268 3rd Qu.:0.3281 3rd Qu.: 0.5138 3rd Qu.: 145.35
Max. :4.3944 Max. :4.4469 Max. :17.2302 Max. :2244.50
TravTimSec_Pct10 TravTimSec_Pct25 TravTimSec_Pct50 TravTimSec_Pct75
Min. : 2.257 Min. : 3.642 Min. : 5.224 Min. : 9.167
1st Qu.: 15.000 1st Qu.: 22.000 1st Qu.: 34.000 1st Qu.: 54.279
Median : 18.000 Median : 25.964 Median : 39.000 Median : 69.000
Mean : 28.370 Mean : 38.433 Mean : 59.459 Mean : 107.611
3rd Qu.: 22.050 3rd Qu.: 33.000 3rd Qu.: 50.250 3rd Qu.: 89.000
Max. :1368.600 Max. :1378.500 Max. :1395.000 Max. :1411.500
TravTimSec_Pct90 WaitTimMin_Mean WaitTimMin_Pct10 WaitTimMin_Pct25
Min. : 69.0 Min. : 4.97 Min. : 0.8875 Min. : 1.885
1st Qu.: 100.9 1st Qu.:23.27 1st Qu.: 2.4651 1st Qu.: 6.260
Median : 122.4 Median :30.13 Median : 3.2146 Median : 8.147
Mean : 220.6 Mean :31.97 Mean : 3.4880 Mean : 8.832
3rd Qu.: 160.4 3rd Qu.:40.30 3rd Qu.: 4.0958 3rd Qu.:10.334
Max. :5829.6 Max. :90.77 Max. :18.6955 Max. :45.485
WaitTimMin_Pct50 WaitTimMin_Pct75 WaitTimMin_Pct90
Min. : 3.932 Min. : 7.761 Min. : 10.98
1st Qu.: 13.419 1st Qu.: 25.719 1st Qu.: 49.28
Median : 16.995 Median : 35.557 Median : 68.02
Mean : 19.554 Mean : 42.097 Mean : 82.82
3rd Qu.: 21.938 3rd Qu.: 51.823 3rd Qu.:123.31
Max. :100.381 Max. :137.830 Max. :157.51
# Column-wise summary of RouteStats_Scaled (presumably the centred and scaled
# version of RouteStats built earlier -- confirm); every column mean should be 0.
summary(RouteStats_Scaled)
BusDayEventNum_Mean BusDayEventNum_Pct10 BusDayEventNum_Pct25 BusDayEventNum_Pct50
Min. :-2.34900 Min. :-1.13383 Min. :-1.53337 Min. :-1.9165
1st Qu.:-0.75118 1st Qu.:-0.67566 1st Qu.:-0.83201 1st Qu.:-0.7729
Median : 0.06179 Median :-0.06164 Median : 0.02019 Median : 0.0493
Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.70626 3rd Qu.: 0.39342 3rd Qu.: 0.62947 3rd Qu.: 0.6967
Max. : 2.80761 Max. : 9.04453 Max. : 5.32346 Max. : 4.0982
BusDayEventNum_Pct75 BusDayEventNum_Pct90 StopSequence_Mean StopSequence_Pct10
Min. :-2.2718 Min. :-2.62837 Min. :-1.86495 Min. :-1.6772
1st Qu.:-0.7075 1st Qu.:-0.52580 1st Qu.:-0.73155 1st Qu.:-0.5269
Median : 0.1120 Median : 0.07633 Median :-0.05882 Median :-0.1435
Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.6725 3rd Qu.: 0.69522 3rd Qu.: 0.62429 3rd Qu.: 0.6233
Max. : 2.6719 Max. : 2.48485 Max. : 2.63089 Max. : 5.9912
StopSequence_Pct25 StopSequence_Pct50 StopSequence_Pct75 StopSequence_Pct90
Min. :-1.89738 Min. :-1.85293 Min. :-1.89086 Min. :-1.89466
1st Qu.:-0.80393 1st Qu.:-0.70321 1st Qu.:-0.73675 1st Qu.:-0.75174
Median :-0.07497 Median :-0.03253 Median :-0.03146 Median :-0.04271
Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
3rd Qu.: 0.51731 3rd Qu.: 0.63814 3rd Qu.: 0.60971 3rd Qu.: 0.59225
Max. : 3.75209 Max. : 2.65016 Max. : 2.66146 Max. : 2.76168
EventTimeHr_Mean EventTimeHr_Pct10 EventTimeHr_Pct25 EventTimeHr_Pct50
Min. :-5.1596 Min. :-3.1516 Min. :-3.4927 Min. :-4.85426
1st Qu.:-0.4307 1st Qu.:-0.2604 1st Qu.:-0.2548 1st Qu.:-0.34753
Median :-0.0284 Median :-0.2604 Median :-0.2548 Median : 0.02803
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.4208 3rd Qu.: 0.2215 3rd Qu.: 0.1499 3rd Qu.: 0.77915
Max. : 5.4312 Max. : 6.4860 Max. : 5.0067 Max. : 3.03251
EventTimeHr_Pct75 EventTimeHr_Pct90 DwellTime2_Mean DwellTime2_Pct10
Min. :-5.6353 Min. :-5.49104 Min. :-0.7076 Min. :-0.06109
1st Qu.:-0.1543 1st Qu.:-0.45204 1st Qu.:-0.4195 1st Qu.:-0.06109
Median : 0.3440 Median :-0.03212 Median :-0.2191 Median :-0.06109
Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.3440 3rd Qu.: 0.80771 3rd Qu.: 0.0284 3rd Qu.:-0.06109
Max. : 2.8353 Max. : 1.64755 Max. : 8.6454 Max. :16.30962
DwellTime2_Pct25 DwellTime2_Pct50 DwellTime2_Pct75 DwellTime2_Pct90
Min. :-0.06109 Min. :-0.1351 Min. :-0.5087 Min. :-0.566526
1st Qu.:-0.06109 1st Qu.:-0.1351 1st Qu.:-0.4032 1st Qu.:-0.318965
Median :-0.06109 Median :-0.1351 Median :-0.1923 Median :-0.177502
Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.000000
3rd Qu.:-0.06109 3rd Qu.:-0.1351 3rd Qu.: 0.1241 3rd Qu.:-0.000673
Max. :16.30962 Max. :15.7425 Max. :13.5702 Max. :12.589555
TravDistMi_Mean TravDistMi_Pct10 TravDistMi_Pct25 TravDistMi_Pct50
Min. :-0.600624 Min. :-0.38582 Min. :-0.40457 Min. :-0.35144
1st Qu.:-0.399492 1st Qu.:-0.13650 1st Qu.:-0.17848 1st Qu.:-0.19720
Median :-0.302934 Median :-0.11603 Median :-0.12575 Median :-0.14764
Mean : 0.000000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
3rd Qu.:-0.004291 3rd Qu.:-0.07243 3rd Qu.:-0.07032 3rd Qu.:-0.08973
Max. : 7.880641 Max. :14.15064 Max. :13.99809 Max. :12.01760
TravDistMi_Pct75 TravDistMi_Pct90 TravTimSec_Mean TravTimSec_Pct10
Min. :-0.4924 Min. :-0.3402 Min. :-0.51997 Min. :-0.27803
1st Qu.:-0.2817 1st Qu.:-0.2503 1st Qu.:-0.40741 1st Qu.:-0.14235
Median :-0.2249 Median :-0.2095 Median :-0.32246 Median :-0.11041
Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
3rd Qu.:-0.1169 3rd Qu.:-0.1248 3rd Qu.:-0.09836 3rd Qu.:-0.06729
Max. : 9.5420 Max. :11.7980 Max. : 8.82760 Max. :14.26984
TravTimSec_Pct25 TravTimSec_Pct50 TravTimSec_Pct75 TravTimSec_Pct90
Min. :-0.35474 Min. :-0.47868 Min. :-0.6266 Min. :-0.3503
1st Qu.:-0.16755 1st Qu.:-0.22470 1st Qu.:-0.3394 1st Qu.:-0.2766
Median :-0.12714 Median :-0.18057 Median :-0.2458 Median :-0.2270
Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.05539 3rd Qu.:-0.08128 3rd Qu.:-0.1185 3rd Qu.:-0.1391
Max. :13.66365 Max. :11.78754 Max. : 8.2991 Max. :12.9596
WaitTimMin_Mean WaitTimMin_Pct10 WaitTimMin_Pct25 WaitTimMin_Pct50
Min. :-1.9759 Min. :-1.4936 Min. :-1.5835 Min. :-1.3173
1st Qu.:-0.6368 1st Qu.:-0.5875 1st Qu.:-0.5864 1st Qu.:-0.5174
Median :-0.1345 Median :-0.1570 Median :-0.1561 Median :-0.2158
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.6101 3rd Qu.: 0.3491 3rd Qu.: 0.3423 3rd Qu.: 0.2010
Max. : 4.3041 Max. : 8.7343 Max. : 8.3550 Max. : 6.8156
WaitTimMin_Pct75 WaitTimMin_Pct90
Min. :-1.4325 Min. :-1.7329
1st Qu.:-0.6833 1st Qu.:-0.8090
Median :-0.2728 Median :-0.3571
Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.4058 3rd Qu.: 0.9767
Max. : 3.9941 Max. : 1.8017
PCA
Using caret::preProcess.
# Structure of RouteStats_Pca (presumably the principal-component scores
# produced by caret::preProcess earlier in the notebook -- confirm).
str(RouteStats_Pca)
'data.frame': 268 obs. of 15 variables:
$ PC1 : num -2.143 -3.5 0.996 2.215 -1.33 ...
$ PC2 : num -0.34 -0.708 2.136 2.78 2.115 ...
$ PC3 : num -0.2398 -0.0136 -0.1131 1.8714 0.3826 ...
$ PC4 : num -0.441 -0.741 -0.231 -1.692 0.286 ...
$ PC5 : num 0.455 0.137 -1.133 -0.39 -0.422 ...
$ PC6 : num -0.542 -0.651 0.384 0.335 0.856 ...
$ PC7 : num -0.248 -0.457 -1.26 -1.574 0.945 ...
$ PC8 : num 0.435 0.373 1.788 0.854 0.745 ...
$ PC9 : num -0.334 -0.845 0.458 -0.891 -0.619 ...
$ PC10: num 0.207 0.69 -1.068 -0.403 -0.095 ...
$ PC11: num -0.72 -0.321 0.904 0.47 0.458 ...
$ PC12: num -0.347 -0.6553 0.3871 -0.0855 0.0615 ...
$ PC13: num 0.1573 -0.0503 0.0605 0.4991 -0.2338 ...
$ PC14: num -0.0734 -0.1431 0.1695 0.0626 0.5568 ...
$ PC15: num -0.1296 -0.1389 -0.1039 -0.3495 -0.0226 ...
# First few rows of the component scores.
head(RouteStats_Pca)
PCA
Using stats::prcomp.
# Structure of the prcomp input: the Route identifier plus the numeric
# per-route statistics.
str(RouteStats)
'data.frame': 268 obs. of 43 variables:
$ Route : chr "10A" "10B" "10E" "11Y" ...
$ BusDayEventNum_Mean : num 272 297 161 157 229 ...
$ BusDayEventNum_Pct10: num 59 75 13.2 13 29 ...
$ BusDayEventNum_Pct25: num 134 154 34 36 119 78 141 109 38 139 ...
$ BusDayEventNum_Pct50: num 257 280 131 87 222 104 236 264 94 239 ...
$ BusDayEventNum_Pct75: num 387 415 265 228 328 ...
$ BusDayEventNum_Pct90: num 484 536 368 341 454 ...
$ StopSequence_Mean : num 28.1 35.1 23.4 25.8 22.8 ...
$ StopSequence_Pct10 : num 6 8 4 5 5 9 7 5 5 4 ...
$ StopSequence_Pct25 : num 15 18 12 12 12 20 16 10 10 8 ...
$ StopSequence_Pct50 : num 28 35 24 24 23 37 31 19 21 14 ...
$ StopSequence_Pct75 : num 42 52 35 40 34 52 46 28 32 21 ...
$ StopSequence_Pct90 : num 50 62 42 49 40 61 55 35 43.7 28 ...
$ EventTimeHr_Mean : num 13.1 13.2 13.7 14.7 13.4 ...
$ EventTimeHr_Pct10 : num 5 6 6 7 6 6 7 6 0 6 ...
$ EventTimeHr_Pct25 : num 8 9 7 8 8 7 9 7 0 9 ...
$ EventTimeHr_Pct50 : num 13 13 17 17 16 8 13 16 1 14 ...
$ EventTimeHr_Pct75 : num 18 18 18 18 18 8 18 21 23 18 ...
$ EventTimeHr_Pct90 : num 21 21 18 19 19 9 20 22 23 21 ...
$ DwellTime2_Mean : num 3.82 5.48 3.58 4.31 4.16 ...
$ DwellTime2_Pct10 : num 0 0 0 0 0 0 0 0 0 0 ...
$ DwellTime2_Pct25 : num 0 0 0 0 0 0 0 0 0 0 ...
$ DwellTime2_Pct50 : num 0 0 0 0 0 0 0 0 0 0 ...
$ DwellTime2_Pct75 : num 2 3 0 6 0 0 4 3 1 4 ...
$ DwellTime2_Pct90 : num 8 10 7 14 3 3 10 8 5 10 ...
$ TravDistMi_Mean : num 0.238 0.227 0.318 0.483 0.291 ...
$ TravDistMi_Pct10 : num 0.1061 0.1009 0.106 0.1071 0.0966 ...
$ TravDistMi_Pct25 : num 0.143 0.138 0.151 0.163 0.13 ...
$ TravDistMi_Pct50 : num 0.195 0.187 0.203 0.25 0.21 ...
$ TravDistMi_Pct75 : num 0.27 0.255 0.262 0.45 0.288 ...
$ TravDistMi_Pct90 : num 0.379 0.362 0.405 0.837 0.41 ...
$ TravTimSec_Mean : num 83.7 71.9 183.9 260.1 101.1 ...
$ TravTimSec_Pct10 : num 15 14 18 17 12 11 16 19 16 24 ...
$ TravTimSec_Pct25 : num 26 22 28 29 16 ...
$ TravTimSec_Pct50 : num 42 37 43 62 23 ...
$ TravTimSec_Pct75 : num 69 66.2 77 118 46 ...
$ TravTimSec_Pct90 : num 110 112 124 188 116 ...
$ WaitTimMin_Mean : num 26.6 26.1 43.4 45.9 40.6 ...
$ WaitTimMin_Pct10 : num 3.47 3.63 2.73 3.89 4.18 ...
$ WaitTimMin_Pct25 : num 8.74 9.03 6.42 9.6 10.78 ...
$ WaitTimMin_Pct50 : num 17.8 18.4 14.1 22.1 22 ...
$ WaitTimMin_Pct75 : num 28.5 28.9 73.3 68.7 45.3 ...
$ WaitTimMin_Pct90 : num 53.1 52.2 144.4 136.9 123.3 ...
# PCA on every RouteStats column except the Route identifier. Each variable is
# centred and scaled to unit variance first, because the columns are on very
# different scales.
PcaRes <- RouteStats %>%
  select(-Route) %>%
  prcomp(center = TRUE, scale. = TRUE)
str(PcaRes)
List of 5
$ sdev : num [1:42] 3.7 2.92 2.31 1.73 1.53 ...
$ rotation: num [1:42, 1:42] -0.0934 0.0337 -0.0481 -0.0841 -0.1032 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:42] "BusDayEventNum_Mean" "BusDayEventNum_Pct10" "BusDayEventNum_Pct25" "BusDayEventNum_Pct50" ...
.. ..$ : chr [1:42] "PC1" "PC2" "PC3" "PC4" ...
$ center : Named num [1:42] 245.2 47.5 104.8 216.2 359 ...
..- attr(*, "names")= chr [1:42] "BusDayEventNum_Mean" "BusDayEventNum_Pct10" "BusDayEventNum_Pct25" "BusDayEventNum_Pct50" ...
$ scale : Named num [1:42] 99.4 40.1 63.8 106.6 150.5 ...
..- attr(*, "names")= chr [1:42] "BusDayEventNum_Mean" "BusDayEventNum_Pct10" "BusDayEventNum_Pct25" "BusDayEventNum_Pct50" ...
$ x : num [1:268, 1:42] -1.5428 -2.1749 -0.4635 0.0989 -1.0434 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:268] "10A" "10B" "10E" "11Y" ...
.. ..$ : chr [1:42] "PC1" "PC2" "PC3" "PC4" ...
- attr(*, "class")= chr "prcomp"
# First rows of the loadings matrix (variables in rows, principal components
# in columns).
head(unclass(PcaRes$rotation))
PC1 PC2 PC3 PC4 PC5
BusDayEventNum_Mean -0.09338810 -0.2730670 -0.01759169 -0.178805587 0.10602591
BusDayEventNum_Pct10 0.03370581 -0.2655559 0.06259492 0.009587185 -0.15407896
BusDayEventNum_Pct25 -0.04814982 -0.2865078 0.05176915 -0.072336986 -0.05437076
BusDayEventNum_Pct50 -0.08414405 -0.2745364 -0.01058914 -0.160976864 0.07875618
BusDayEventNum_Pct75 -0.10320703 -0.2415034 -0.03584771 -0.193510042 0.14584281
BusDayEventNum_Pct90 -0.10653834 -0.2223407 -0.04070840 -0.214169524 0.18258238
PC6 PC7 PC8 PC9 PC10
BusDayEventNum_Mean -0.0939145201 0.2275389 -0.03509260 0.085349993 -0.096147637
BusDayEventNum_Pct10 -0.0217740073 0.2446381 0.13095679 0.043360507 -0.005950166
BusDayEventNum_Pct25 0.0004701741 0.2390906 0.03248930 -0.009471207 -0.017292283
BusDayEventNum_Pct50 -0.0554379332 0.2234916 -0.08994723 0.033847958 -0.111576386
BusDayEventNum_Pct75 -0.1377039889 0.2150016 -0.05893128 0.121163990 -0.122631709
BusDayEventNum_Pct90 -0.1089639919 0.2000163 -0.02482000 0.110488185 -0.115543425
PC11 PC12 PC13 PC14
BusDayEventNum_Mean 0.062801021 -0.03603242 0.0008623936 -0.08280278
BusDayEventNum_Pct10 -0.046015784 0.29977626 -0.2332066150 0.52178884
BusDayEventNum_Pct25 -0.001958062 0.28744503 0.1556826973 0.35680391
BusDayEventNum_Pct50 0.062125441 -0.04358467 0.1745461088 0.01903697
BusDayEventNum_Pct75 0.102023389 -0.13985501 -0.0236560860 -0.21451196
BusDayEventNum_Pct90 0.053719704 -0.13385524 -0.1264852516 -0.29283825
PC15 PC16 PC17 PC18
BusDayEventNum_Mean 0.005052848 -0.01508907 0.009745823 0.010680226
BusDayEventNum_Pct10 0.060660196 0.13458006 -0.016134378 0.210562820
BusDayEventNum_Pct25 -0.143177884 0.06475744 0.096873687 -0.087000597
BusDayEventNum_Pct50 -0.048141311 0.13311402 -0.102884292 -0.348212611
BusDayEventNum_Pct75 -0.046739822 -0.04324681 -0.044670328 0.008981314
BusDayEventNum_Pct90 0.165416482 -0.20053696 0.127273115 0.335945988
PC19 PC20 PC21 PC22 PC23
BusDayEventNum_Mean 0.0001311624 0.01481802 -0.02010332 -0.008736911 0.01507617
BusDayEventNum_Pct10 0.1890747724 -0.14361466 0.32676688 0.298994027 0.25601598
BusDayEventNum_Pct25 -0.1119779304 -0.02693647 -0.37119644 -0.419722865 -0.36146370
BusDayEventNum_Pct50 -0.0798606326 0.14967773 -0.21733581 0.167905085 0.13496970
BusDayEventNum_Pct75 -0.0091188994 -0.05319933 0.04264386 0.077945878 0.28314788
BusDayEventNum_Pct90 0.0371772577 -0.05868440 0.17356562 -0.138934213 -0.22433912
PC24 PC25 PC26 PC27 PC28
BusDayEventNum_Mean 0.035524341 0.01120519 0.01145701 -0.01792224 -0.05433755
BusDayEventNum_Pct10 -0.001511485 0.06567621 -0.04280989 0.04234202 -0.05983503
BusDayEventNum_Pct25 -0.268740617 -0.03687138 0.01106133 0.03624843 0.11537293
BusDayEventNum_Pct50 0.467832249 -0.03563624 -0.05659897 -0.22924263 -0.27832184
BusDayEventNum_Pct75 -0.060158497 -0.16655286 0.29788147 0.18922670 0.52573534
BusDayEventNum_Pct90 -0.173530963 0.20837876 -0.20790238 0.01226040 -0.30881153
PC29 PC30 PC31 PC32 PC33
BusDayEventNum_Mean -0.027244823 -0.03406674 0.01164006 0.01939078 0.01947718
BusDayEventNum_Pct10 0.005149411 0.01270162 0.02074997 0.03403886 0.04113859
BusDayEventNum_Pct25 0.080478872 -0.01417978 -0.04301737 -0.05300308 -0.05209785
BusDayEventNum_Pct50 -0.243481810 0.08604567 0.01413502 0.01985978 0.06027302
BusDayEventNum_Pct75 0.156224102 -0.18879008 0.05609357 -0.03700094 -0.20358027
BusDayEventNum_Pct90 0.034396647 0.16745680 -0.09754177 0.04105972 0.15253964
PC34 PC35 PC36 PC37
BusDayEventNum_Mean 0.002353764 0.185633777 0.0014790395 0.20337749
BusDayEventNum_Pct10 0.012954051 -0.012131682 0.0059264541 -0.01199984
BusDayEventNum_Pct25 -0.018879547 -0.031598554 0.0001730291 -0.02891357
BusDayEventNum_Pct50 0.016046771 -0.081880339 -0.0125889072 -0.02732315
BusDayEventNum_Pct75 -0.011931747 0.003619082 0.0037123974 -0.08662220
BusDayEventNum_Pct90 -0.018446717 -0.076652688 0.0006734246 -0.04129042
PC38 PC39 PC40 PC41
BusDayEventNum_Mean -0.83310830 -0.0555982816 -0.017040143 -0.021911140
BusDayEventNum_Pct10 0.05102906 0.0014005972 0.001480221 -0.000404305
BusDayEventNum_Pct25 0.08830438 0.0050649048 0.005018847 0.001109075
BusDayEventNum_Pct50 0.25817484 0.0262306950 -0.008653629 0.013151507
BusDayEventNum_Pct75 0.22914052 0.0219838671 0.004533696 -0.001389741
BusDayEventNum_Pct90 0.26868985 -0.0004870527 0.018278493 0.010426258
PC42
BusDayEventNum_Mean 0.000000e+00
BusDayEventNum_Pct10 1.137111e-15
BusDayEventNum_Pct25 -1.240408e-15
BusDayEventNum_Pct50 -7.054637e-16
BusDayEventNum_Pct75 1.098402e-15
BusDayEventNum_Pct90 1.144457e-16
# Extract the variable-level PCA results (coord, cor, cos2, contrib) and print
# the overview; the wrapping parentheses make the assignment print its value.
(PcaRes_Vars <- get_pca_var(PcaRes))
Principal Component Analysis Results for variables
===================================================
Name Description
1 "$coord" "Coordinates for the variables"
2 "$cor" "Correlations between variables and dimensions"
3 "$cos2" "Cos2 for the variables"
4 "$contrib" "contributions of the variables"
# Coordinates of each input variable on the principal components (Dim.1,
# Dim.2, ...), i.e. where the variables lie in relation to the eigenvectors.
PcaRes_Vars$coord
Dim.1 Dim.2 Dim.3 Dim.4
BusDayEventNum_Mean -0.3453868585 -0.79864826 -0.040549393 -0.3101788419
BusDayEventNum_Pct10 0.1246576734 -0.77668029 0.144283261 0.0166311473
BusDayEventNum_Pct25 -0.1780774431 -0.83795908 0.119329510 -0.1254849071
BusDayEventNum_Pct50 -0.3111986196 -0.80294590 -0.024408294 -0.2792508785
BusDayEventNum_Pct75 -0.3817012137 -0.70633309 -0.082630086 -0.3356870531
BusDayEventNum_Pct90 -0.3940217542 -0.65028732 -0.093834131 -0.3715256112
StopSequence_Mean -0.6062202976 -0.63368485 -0.000250384 0.3681810417
StopSequence_Pct10 -0.5104512059 -0.55305750 0.131422129 0.4549991039
StopSequence_Pct25 -0.5870433756 -0.61971675 0.064032900 0.4156613851
StopSequence_Pct50 -0.6037565277 -0.63004661 0.011954416 0.3789532825
StopSequence_Pct75 -0.6060170236 -0.63112642 -0.017434449 0.3547384747
StopSequence_Pct90 -0.6056307545 -0.63241456 -0.027117609 0.3373818795
EventTimeHr_Mean 0.1409841911 -0.45123485 -0.434425667 -0.5392611879
EventTimeHr_Pct10 0.3650802311 -0.18481759 0.140564337 -0.0005859469
EventTimeHr_Pct25 0.1812387365 -0.34043913 -0.153840693 -0.1319928923
EventTimeHr_Pct50 0.0998863834 -0.23186174 -0.460244684 -0.4198494889
EventTimeHr_Pct75 -0.0496537949 -0.34904266 -0.464505273 -0.6021833375
EventTimeHr_Pct90 -0.1238963525 -0.40981359 -0.226375988 -0.6201279862
DwellTime2_Mean 0.8761725891 -0.08851345 0.023548293 -0.0391522334
DwellTime2_Pct10 0.7030526682 -0.51973983 0.023351765 0.1159456061
DwellTime2_Pct25 0.7030526682 -0.51973983 0.023351765 0.1159456061
DwellTime2_Pct50 0.7598191952 -0.45628425 0.033756487 0.1016314507
DwellTime2_Pct75 0.8567433124 -0.36032024 0.060406952 0.0661690571
Dim.5 Dim.6 Dim.7 Dim.8
BusDayEventNum_Mean 0.1620448429 -0.1237663077 0.286822488 -0.035185407
BusDayEventNum_Pct10 -0.2354867822 -0.0286951206 0.308376700 0.131303136
BusDayEventNum_Pct25 -0.0830976196 0.0006196242 0.301383902 0.032575226
BusDayEventNum_Pct50 0.1203671065 -0.0730595045 0.281720681 -0.090185116
BusDayEventNum_Pct75 0.2228990527 -0.1814747521 0.271018677 -0.059087140
BusDayEventNum_Pct90 0.2790500120 -0.1435994235 0.252129115 -0.024885637
StopSequence_Mean 0.1502459692 0.1660744495 -0.189215347 0.006633515
StopSequence_Pct10 0.0891931972 0.2237366241 -0.097986712 -0.026684850
StopSequence_Pct25 0.1212047481 0.2075489453 -0.147387480 -0.002888018
StopSequence_Pct50 0.1409645113 0.1766307694 -0.184145228 0.002573750
StopSequence_Pct75 0.1561083910 0.1569403785 -0.196455551 0.009423167
StopSequence_Pct90 0.1621689839 0.1479344469 -0.203929297 0.012425368
EventTimeHr_Mean -0.2976406788 0.3611790281 -0.217197870 -0.035433483
EventTimeHr_Pct10 -0.5875325743 0.4544560234 0.263701295 -0.048631537
EventTimeHr_Pct25 -0.5127704888 0.5356588751 0.145248857 -0.145064347
EventTimeHr_Pct50 -0.2372086273 0.3844329548 -0.233176335 -0.032996948
EventTimeHr_Pct75 0.1182887213 -0.0303309505 -0.428088045 0.049596566
EventTimeHr_Pct90 0.2803410616 -0.1150452357 -0.357162484 0.000334219
DwellTime2_Mean 0.3265850529 0.0121630989 0.009739963 -0.102156699
DwellTime2_Pct10 -0.2636000040 -0.2115407053 -0.047004287 0.250779277
DwellTime2_Pct25 -0.2636000040 -0.2115407053 -0.047004287 0.250779277
DwellTime2_Pct50 -0.2179272276 -0.1758810591 -0.087807917 0.232529164
DwellTime2_Pct75 -0.0008099102 -0.1147971824 -0.114338424 -0.033366524
Dim.9 Dim.10 Dim.11 Dim.12
BusDayEventNum_Mean 0.0813826783 -0.081069520 0.0462348712 -0.024349934
BusDayEventNum_Pct10 0.0413449850 -0.005017046 -0.0338773772 0.202582359
BusDayEventNum_Pct25 -0.0090309577 -0.014580463 -0.0014415492 0.194249180
BusDayEventNum_Pct50 0.0322746072 -0.094078694 0.0457375013 -0.029453587
BusDayEventNum_Pct75 0.1155319368 -0.103400293 0.0751108534 -0.094511011
BusDayEventNum_Pct90 0.1053523749 -0.097423611 0.0395490958 -0.090456495
StopSequence_Mean 0.0384816056 0.021593551 0.0004048936 0.015622886
StopSequence_Pct10 0.0001000936 -0.009691535 0.0124519915 -0.054353562
StopSequence_Pct25 0.0167808943 0.006077356 0.0125369547 -0.018280995
StopSequence_Pct50 0.0387279886 0.015011648 0.0033734128 0.007930651
StopSequence_Pct75 0.0420721101 0.026003417 -0.0026818750 0.020028619
StopSequence_Pct90 0.0427386307 0.028961923 -0.0015467238 0.027909000
EventTimeHr_Mean -0.0673379520 0.031012680 -0.0165592080 0.005965818
EventTimeHr_Pct10 -0.0006979970 -0.017882118 -0.0190645441 0.048214425
EventTimeHr_Pct25 -0.1885746677 0.072593235 0.0726647407 0.155906584
EventTimeHr_Pct50 0.0974080030 -0.200682500 -0.0094097458 -0.411753703
EventTimeHr_Pct75 -0.0084453835 0.172797161 -0.0749370011 0.118497628
EventTimeHr_Pct90 -0.1158461142 0.289621254 -0.1227874388 0.112833597
DwellTime2_Mean -0.1183567247 -0.050082868 0.0274029577 0.095865161
DwellTime2_Pct10 0.1206958380 0.076920817 -0.0376909074 -0.056673828
DwellTime2_Pct25 0.1206958380 0.076920817 -0.0376909074 -0.056673828
DwellTime2_Pct50 0.1081772424 0.019181631 -0.1150009473 0.014517843
DwellTime2_Pct75 0.0715248085 0.077271116 -0.0558586129 0.037998358
Dim.13 Dim.14 Dim.15 Dim.16
BusDayEventNum_Mean 0.0005167747 -0.047570404 0.002425721 -0.0066602099
BusDayEventNum_Pct10 -0.1397450859 0.299768981 0.029121141 0.0594026800
BusDayEventNum_Pct25 0.0932902007 0.204984730 -0.068735407 0.0285834721
BusDayEventNum_Pct50 0.1045937783 0.010936786 -0.023111199 0.0587555818
BusDayEventNum_Pct75 -0.0141755060 -0.123237654 -0.022438387 -0.0190888355
BusDayEventNum_Pct90 -0.0757941294 -0.168236297 0.079411491 -0.0885155840
StopSequence_Mean -0.0240452988 0.011363881 -0.014966637 -0.0737578689
StopSequence_Pct10 0.0518286399 -0.106219129 0.111863344 0.3171542212
StopSequence_Pct25 0.0070811784 -0.034301981 0.031199729 0.0635940150
StopSequence_Pct50 -0.0169389599 0.011063201 -0.004932205 -0.0636781924
StopSequence_Pct75 -0.0312723530 0.027802520 -0.024992556 -0.1129812048
StopSequence_Pct90 -0.0372327148 0.026953963 -0.036298428 -0.1321627299
EventTimeHr_Mean -0.0307188539 -0.039769118 0.004635036 0.0228034928
EventTimeHr_Pct10 -0.3868700922 -0.149147121 0.047434196 -0.0036960152
EventTimeHr_Pct25 0.3279595796 -0.136473389 -0.032616028 -0.0870215474
EventTimeHr_Pct50 0.0491970084 0.241784024 0.060255232 0.0002987279
EventTimeHr_Pct75 -0.0894314375 -0.022018923 -0.017960172 0.0700084976
EventTimeHr_Pct90 -0.0392403183 -0.014684949 -0.032214959 0.0278113336
DwellTime2_Mean 0.0468692445 0.056696376 0.209275868 -0.0183507211
DwellTime2_Pct10 0.0635984540 -0.049637715 0.030751717 -0.0126587792
DwellTime2_Pct25 0.0635984540 -0.049637715 0.030751717 -0.0126587792
DwellTime2_Pct50 0.0723555858 -0.050133150 0.122463044 -0.0228010059
DwellTime2_Pct75 0.0455601347 0.002206588 0.212127496 -0.0698035974
Dim.17 Dim.18 Dim.19 Dim.20
BusDayEventNum_Mean 0.004053993 0.003812342 0.0000452259 0.004920803
BusDayEventNum_Pct10 -0.006711456 0.075161098 0.0651945705 -0.047691882
BusDayEventNum_Pct25 0.040296780 -0.031055152 -0.0386109315 -0.008945123
BusDayEventNum_Pct50 -0.042797025 -0.124295648 -0.0275366173 0.049705322
BusDayEventNum_Pct75 -0.018581623 0.003205910 -0.0031442731 -0.017666554
BusDayEventNum_Pct90 0.052942102 0.119917036 0.0128190309 -0.019488049
StopSequence_Mean -0.009303088 -0.009559360 0.0119120365 0.016866209
StopSequence_Pct10 0.045349275 0.045543172 -0.0459816126 -0.049385578
StopSequence_Pct25 -0.002067620 0.020114063 0.0007124121 -0.007712216
StopSequence_Pct50 -0.017457007 -0.003956252 0.0123207652 0.013197655
StopSequence_Pct75 -0.009535570 -0.020114558 0.0122675119 0.023013928
StopSequence_Pct90 -0.011506402 -0.016756976 0.0173595805 0.027994258
EventTimeHr_Mean -0.013535163 -0.054116912 0.0262138237 -0.002274056
EventTimeHr_Pct10 -0.042868499 -0.037564564 -0.0423537688 0.049825363
EventTimeHr_Pct25 0.051981139 0.066097079 -0.0013513232 -0.020813588
EventTimeHr_Pct50 -0.030557635 0.006462641 0.0086339618 0.005666899
EventTimeHr_Pct75 0.004644992 0.017113443 0.0148970297 0.008548211
EventTimeHr_Pct90 -0.009177214 0.029866953 -0.0385430889 -0.017418715
DwellTime2_Mean -0.019303612 0.029122928 -0.0144632230 0.089022119
DwellTime2_Pct10 0.013710612 0.023812384 0.0225802785 0.031484673
DwellTime2_Pct25 0.013710612 0.023812384 0.0225802785 0.031484673
DwellTime2_Pct50 -0.038827136 -0.028228576 -0.0305634324 0.024951143
DwellTime2_Pct75 -0.047699538 -0.050300691 -0.0435822558 -0.036105695
Dim.21 Dim.22 Dim.23 Dim.24
BusDayEventNum_Mean -6.310695e-03 -0.0023823659 0.003722401 0.0078284159
BusDayEventNum_Pct10 1.025764e-01 0.0815291755 0.063211948 -0.0003330823
BusDayEventNum_Pct25 -1.165234e-01 -0.1144493069 -0.089247650 -0.0592217411
BusDayEventNum_Pct50 -6.822457e-02 0.0457840689 0.033324863 0.1030950985
BusDayEventNum_Pct75 1.338647e-02 0.0212541475 0.069910983 -0.0132569872
BusDayEventNum_Pct90 5.448453e-02 -0.0378843414 -0.055390732 -0.0382406124
StopSequence_Mean 2.141301e-03 -0.0008767539 0.001473267 0.0022539848
StopSequence_Pct10 -9.429553e-05 -0.0072540956 -0.012726560 -0.0102877892
StopSequence_Pct25 1.036487e-03 0.0101805796 0.006022349 0.0062252760
StopSequence_Pct50 6.041747e-03 0.0045960029 0.002553015 0.0077275469
StopSequence_Pct75 1.262483e-03 -0.0005165235 -0.001221804 0.0039562629
StopSequence_Pct90 -3.415561e-03 -0.0017967417 -0.002802168 -0.0007275000
EventTimeHr_Mean -1.760218e-02 0.0082595781 0.005773987 0.0195931212
EventTimeHr_Pct10 -2.479089e-02 -0.0135454891 -0.038699425 0.0122454257
EventTimeHr_Pct25 5.125354e-02 0.0212507063 0.028577039 -0.0095561036
EventTimeHr_Pct50 7.033257e-03 0.0113720747 -0.023812393 -0.0340671019
EventTimeHr_Pct75 3.936493e-02 -0.1359559053 0.092639975 -0.0035091885
EventTimeHr_Pct90 -2.571477e-02 0.1269598276 -0.097181072 0.0078862915
DwellTime2_Mean 6.229252e-02 -0.0137054563 -0.041384412 0.0327874766
DwellTime2_Pct10 1.412795e-02 0.0101168179 -0.042555885 0.0106898753
DwellTime2_Pct25 1.412795e-02 0.0101168179 -0.042555885 0.0106898753
DwellTime2_Pct50 -1.759360e-02 -0.0065513421 0.029210609 -0.0084795311
DwellTime2_Pct75 -8.191325e-02 -0.0189190228 0.067313100 -0.0205330465
Dim.25 Dim.26 Dim.27 Dim.28
BusDayEventNum_Mean 2.274769e-03 0.0022549762 -3.129965e-03 -0.0090042162
BusDayEventNum_Pct10 1.333295e-02 -0.0084258722 7.394671e-03 -0.0099151989
BusDayEventNum_Pct25 -7.485269e-03 0.0021770983 6.330476e-03 0.0191183228
BusDayEventNum_Pct50 -7.234523e-03 -0.0111398481 -4.003526e-02 -0.0461204114
BusDayEventNum_Pct75 -3.381194e-02 0.0586292399 3.304682e-02 0.0871190342
BusDayEventNum_Pct90 4.230303e-02 -0.0409194913 2.141173e-03 -0.0511728240
StopSequence_Mean -1.965571e-03 0.0005382572 -2.467380e-03 0.0010152450
StopSequence_Pct10 3.196022e-03 0.0064521814 3.436764e-03 -0.0011476906
StopSequence_Pct25 -5.595126e-05 -0.0004687659 1.461262e-03 0.0035162963
StopSequence_Pct50 1.166179e-03 -0.0022671642 8.910097e-05 0.0030037459
StopSequence_Pct75 -3.367941e-03 -0.0005201314 -5.233024e-03 0.0001038537
StopSequence_Pct90 -4.711327e-03 0.0005565750 -5.749750e-03 0.0001517165
EventTimeHr_Mean -2.654675e-02 -0.0567252434 4.393663e-02 -0.0323942338
EventTimeHr_Pct10 4.000136e-03 0.0310441236 -1.285666e-02 0.0128737330
EventTimeHr_Pct25 1.229891e-02 0.0226937123 -1.156552e-02 0.0055517463
EventTimeHr_Pct50 1.305587e-02 0.0143256285 -1.270386e-02 0.0195568668
EventTimeHr_Pct75 -2.981156e-02 -0.0127535911 -3.550813e-02 0.0070284952
EventTimeHr_Pct90 3.705157e-02 0.0403544364 9.864400e-03 0.0157652542
DwellTime2_Mean -2.650585e-03 0.0085754261 -9.444301e-02 0.0290588516
DwellTime2_Pct10 -6.170268e-02 -0.0079264619 7.303716e-03 -0.0019073143
DwellTime2_Pct25 -6.170268e-02 -0.0079264619 7.303716e-03 -0.0019073143
DwellTime2_Pct50 -5.970478e-03 0.0074308014 1.965785e-02 0.0156426314
DwellTime2_Pct75 9.558740e-02 0.0219012201 3.372714e-02 -0.0201269523
Dim.29 Dim.30 Dim.31 Dim.32
BusDayEventNum_Mean -0.0041916024 -0.0047035001 0.0013576329 0.0020875101
BusDayEventNum_Pct10 0.0007922342 0.0017536768 0.0024201627 0.0036644461
BusDayEventNum_Pct25 0.0123816341 -0.0019577622 -0.0050173112 -0.0057060343
BusDayEventNum_Pct50 -0.0374595540 0.0118800870 0.0016486313 0.0021380006
BusDayEventNum_Pct75 0.0240349995 -0.0260657217 0.0065424480 -0.0039833281
BusDayEventNum_Pct90 0.0052919068 0.0231202960 -0.0113767398 0.0044202755
StopSequence_Mean 0.0022701521 -0.0014270289 0.0032879882 0.0066635155
StopSequence_Pct10 0.0019227195 -0.0075530544 0.0167257013 0.0351970398
StopSequence_Pct25 -0.0030686694 0.0093150099 -0.0309898265 -0.0744584980
StopSequence_Pct50 0.0065783326 0.0008642006 -0.0177539581 -0.0246282656
StopSequence_Pct75 -0.0009191151 -0.0044454775 0.0122153215 0.0203658402
StopSequence_Pct90 -0.0012335055 -0.0024593334 0.0167940718 0.0407345523
EventTimeHr_Mean 0.0765277937 -0.0545117383 -0.0024338661 -0.0020714705
EventTimeHr_Pct10 -0.0219921000 0.0111865131 0.0024188066 -0.0013087163
EventTimeHr_Pct25 -0.0187581252 0.0152738660 0.0022742693 0.0027029130
EventTimeHr_Pct50 -0.0203390185 0.0201581873 0.0002943196 0.0018295472
EventTimeHr_Pct75 -0.0320036386 0.0259483826 0.0065242035 -0.0035861908
EventTimeHr_Pct90 -0.0178294381 0.0020603224 -0.0055413484 0.0018972260
DwellTime2_Mean 0.0476316972 -0.0261889632 -0.0006741801 -0.0031350598
DwellTime2_Pct10 -0.0094901336 0.0035665255 0.0292524947 -0.0100339033
DwellTime2_Pct25 -0.0094901336 0.0035665255 0.0292524947 -0.0100339033
DwellTime2_Pct50 -0.0161521672 -0.0014006964 -0.0816016644 0.0312323996
DwellTime2_Pct75 0.0052308677 0.0023356343 0.0402489533 -0.0152891208
Dim.33 Dim.34 Dim.35 Dim.36
BusDayEventNum_Mean 0.0019174296 0.0001632924 1.180630e-02 7.934989e-05
BusDayEventNum_Pct10 0.0040498849 0.0008986878 -7.715742e-04 3.179520e-04
BusDayEventNum_Pct25 -0.0051287684 -0.0013097693 -2.009666e-03 9.282942e-06
BusDayEventNum_Pct50 0.0059335725 0.0011132454 -5.207584e-03 -6.753900e-04
BusDayEventNum_Pct75 -0.0200414436 -0.0008277654 2.301734e-04 1.991687e-04
BusDayEventNum_Pct90 0.0150167531 -0.0012797417 -4.875106e-03 3.612897e-05
StopSequence_Mean -0.0012949113 0.0005160241 5.337299e-04 8.744366e-04
StopSequence_Pct10 -0.0008471353 0.0002321344 1.492560e-03 -4.315794e-03
StopSequence_Pct25 0.0042135732 -0.0026518637 -2.038217e-03 2.439069e-02
StopSequence_Pct50 -0.0007762503 -0.0014603898 -5.736917e-03 -3.912882e-02
StopSequence_Pct75 -0.0029224987 0.0015217648 5.199057e-03 -6.228567e-03
StopSequence_Pct90 -0.0001781289 0.0017918090 1.374886e-03 2.479692e-02
EventTimeHr_Mean 0.0169091353 0.0020482488 2.373909e-03 1.387977e-03
EventTimeHr_Pct10 -0.0092632055 -0.0015629600 -3.998048e-04 -4.523724e-04
EventTimeHr_Pct25 -0.0014031995 0.0003230656 -4.000983e-04 -5.500721e-04
EventTimeHr_Pct50 -0.0034199825 -0.0010049657 2.074391e-05 -5.830054e-05
EventTimeHr_Pct75 -0.0057114594 -0.0006949630 9.108066e-04 -1.252353e-03
EventTimeHr_Pct90 -0.0068554462 -0.0008813521 -5.106255e-03 7.478921e-05
DwellTime2_Mean -0.0060020507 0.0012130849 -4.579912e-04 1.151039e-03
DwellTime2_Pct10 -0.0019643683 -0.0035892526 -1.382631e-03 -1.972075e-05
DwellTime2_Pct25 -0.0019643683 -0.0035892526 -1.382631e-03 -1.972075e-05
DwellTime2_Pct50 0.0109535045 0.0120062255 -1.776263e-04 8.877460e-04
DwellTime2_Pct75 -0.0030097804 -0.0083966376 2.866777e-03 -2.249948e-04
Dim.37 Dim.38 Dim.39 Dim.40
BusDayEventNum_Mean 9.566998e-03 -3.363389e-02 -1.863510e-03 -4.617860e-04
BusDayEventNum_Pct10 -5.644794e-04 2.060123e-03 4.694439e-05 4.011383e-05
BusDayEventNum_Pct25 -1.360112e-03 3.564986e-03 1.697625e-04 1.360102e-04
BusDayEventNum_Pct50 -1.285297e-03 1.042292e-02 8.791850e-04 -2.345124e-04
BusDayEventNum_Pct75 -4.074760e-03 9.250762e-03 7.368423e-04 1.228626e-04
BusDayEventNum_Pct90 -1.942326e-03 1.084743e-02 -1.632475e-05 4.953451e-04
StopSequence_Mean -9.087007e-04 -9.412291e-04 6.541437e-04 -2.630648e-04
StopSequence_Pct10 -6.384350e-04 4.634691e-04 6.674373e-04 5.108676e-04
StopSequence_Pct25 5.391859e-05 -5.529682e-04 -3.312383e-03 -1.737062e-03
StopSequence_Pct50 5.271147e-03 -8.992085e-04 1.187709e-02 1.801878e-03
StopSequence_Pct75 -2.010497e-03 3.010653e-03 -2.612996e-02 5.039646e-04
StopSequence_Pct90 -1.747338e-03 -4.642906e-04 1.633786e-02 -8.388692e-04
EventTimeHr_Mean -1.070934e-04 9.776305e-04 -7.624559e-05 -4.575407e-04
EventTimeHr_Pct10 8.373793e-05 6.981350e-05 1.456376e-04 1.412071e-04
EventTimeHr_Pct25 5.643783e-05 -2.865494e-04 -1.006636e-04 5.655169e-05
EventTimeHr_Pct50 -5.803978e-04 -7.935715e-04 2.962251e-05 3.213865e-04
EventTimeHr_Pct75 -8.439012e-04 8.317391e-05 2.851096e-04 1.961147e-05
EventTimeHr_Pct90 9.874299e-04 -3.435562e-04 -3.745431e-04 1.607303e-04
DwellTime2_Mean -1.220655e-03 -8.832596e-05 7.004446e-05 -4.934088e-04
DwellTime2_Pct10 6.545549e-03 1.424356e-03 1.965091e-04 3.571272e-04
DwellTime2_Pct25 6.545549e-03 1.424356e-03 1.965091e-04 3.571272e-04
DwellTime2_Pct50 -2.610665e-03 -9.572892e-04 -1.084876e-03 -9.380582e-04
DwellTime2_Pct75 -1.422715e-03 1.470083e-05 6.958196e-04 6.322145e-04
Dim.41 Dim.42
BusDayEventNum_Mean -2.744270e-04 0.000000e+00
BusDayEventNum_Pct10 -5.063735e-06 2.923130e-31
BusDayEventNum_Pct25 1.389066e-05 -3.188672e-31
BusDayEventNum_Pct50 1.647166e-04 -1.813510e-31
BusDayEventNum_Pct75 -1.740587e-05 2.823621e-31
BusDayEventNum_Pct90 1.305841e-04 2.942013e-32
StopSequence_Mean 1.111001e-02 -6.048767e-32
StopSequence_Pct10 -3.732037e-04 4.220146e-31
StopSequence_Pct25 -1.303292e-03 -1.880061e-31
StopSequence_Pct50 -2.471636e-03 1.104379e-31
StopSequence_Pct75 -3.130287e-03 -6.970740e-32
StopSequence_Pct90 -3.926392e-03 -1.002304e-31
EventTimeHr_Mean -5.052287e-05 1.123309e-31
EventTimeHr_Pct10 1.745821e-05 -2.717004e-31
EventTimeHr_Pct25 1.158883e-05 2.292590e-31
EventTimeHr_Pct50 5.550145e-05 1.339744e-32
EventTimeHr_Pct75 -6.726525e-05 -4.833701e-32
EventTimeHr_Pct90 7.423506e-05 -8.943109e-32
DwellTime2_Mean -1.100660e-05 4.376558e-31
DwellTime2_Pct10 4.840789e-05 -1.817734e-16
DwellTime2_Pct25 4.840789e-05 1.817734e-16
DwellTime2_Pct50 5.752855e-05 6.421522e-32
DwellTime2_Pct75 -7.283262e-05 -2.455485e-32
[ reached getOption("max.print") -- omitted 19 rows ]
# Factor-variable map: every variable is plotted and coloured by its
# contribution, on a white -> blue -> red scale centred at a contribution of 2.
fviz_pca_var(PcaRes, col.var = "contrib") +
  scale_color_gradient2(
    low = "white",
    mid = "blue",
    high = "red",
    midpoint = 2
  )
# Factor-variable map restricted to the 10 variables with the highest
# contribution; the colour scale is re-centred at 3.8 for this subset.
fviz_pca_var(
  PcaRes,
  col.var = "contrib",
  select.var = list(contrib = 10)
) +
  scale_color_gradient2(
    low = "white",
    mid = "blue",
    high = "red",
    midpoint = 3.8
  )
# Extract the individual-level (one row per route) PCA results from PcaRes.
# Wrapping the assignment in parentheses prints the object, which lists its
# components ($coord, $cos2, $contrib).
(PcaRes_Rtes <- get_pca_ind(PcaRes))
Principal Component Analysis Results for individuals
===================================================
Name Description
1 "$coord" "Coordinates for the individuals"
2 "$cos2" "Cos2 for the individuals"
3 "$contrib" "contributions of the individuals"
# Coordinates of every route on each principal-component dimension,
# i.e. where the routes lie in relation to the eigenvectors.
PcaRes_Rtes[["coord"]]
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
10A -1.542836562 -1.16606749 0.733765527 0.057073840 0.590293129 -0.269474784
10B -2.174899598 -2.53491936 0.733466454 0.688172993 0.669526180 0.494469415
10E -0.463521733 1.65146836 -1.178721490 0.337496580 -0.648636145 0.260889810
11Y 0.098927711 1.17610835 -2.083752449 0.773359544 -0.516913593 1.115537010
15K -1.043392379 0.30982796 -1.210203039 -0.029434585 -0.460845248 -0.130844848
15L -1.894257885 1.62934766 4.398957144 7.212378021 -0.104278063 0.210713042
16A -1.435101124 -1.62802533 1.448749115 0.524381702 -0.001912895 0.704811101
16B -0.788573489 -0.27880075 -2.718996161 -1.625838089 -0.099908303 -0.885757776
16E -1.922710359 1.64106507 -4.312513228 1.577890278 3.320449289 -5.795946883
16G 0.060304638 0.08737982 2.874418347 -2.364824129 -0.354269527 -0.194825770
16H 0.236105907 1.70906049 0.656357075 -0.864689093 -0.729125000 -0.046153278
16J -0.285626404 -0.14184890 1.496359076 -0.288857495 -0.637278511 0.036502493
16L -0.810616482 0.66969878 -2.103607144 0.864761642 -0.654968161 0.526228622
16X 1.828629979 3.68928326 -0.306993995 0.470061832 -0.781990660 0.179404703
16Y 2.399925187 3.62975175 0.680504819 -0.615273989 -1.166996087 0.822837198
17A -2.006100069 -1.51481085 -5.164208278 1.664709429 -0.680549043 0.398939090
17B -2.353817207 -2.31352049 -7.565353870 3.281847281 -1.797837271 1.541832049
17F 0.480565180 2.84099795 -1.152850541 -0.171283324 -1.638291450 -0.694490494
17G -0.408234254 0.04980661 -2.493250807 -0.460998538 -2.121401980 2.087848459
17H -0.814605999 1.27136745 -0.532942392 0.710896403 -0.193748099 -0.005766723
17K -0.397241210 1.30236239 -2.277268263 0.029700607 -0.706863834 -0.138775928
17L 0.238497659 2.21804711 -1.828323379 0.511129338 -0.543385946 0.351670925
17M -0.746352768 0.11407475 -2.114505818 -0.207910583 -2.079078219 1.903358765
Dim.7 Dim.8 Dim.9 Dim.10 Dim.11
10A -0.540367721 1.551212e-01 -0.328117744 0.656714911 -0.4170391975
10B -0.529313879 4.005257e-02 -0.299677622 0.687140405 -0.3350517536
10E -2.120380643 1.259242e-01 1.257079027 -0.888086064 0.5883003707
11Y -2.119144999 1.798144e-01 0.484448941 -0.283582271 0.1424017421
15K -0.638335829 2.919903e-01 0.109018868 -0.406269694 0.0730366667
15L 1.517671710 5.944777e-01 0.019794831 -0.885017840 0.3695757808
16A -0.488284743 8.642134e-02 -0.572554465 0.806854117 -0.5809893915
16B -0.980745758 2.327846e-02 0.024616625 0.054566496 -0.0165284480
16E -1.316861978 7.235517e-01 0.455887132 1.562106321 -0.1353112953
16G 0.071793936 -3.005923e-02 -0.340220097 0.638041536 -0.6164829761
16H -1.328478608 1.614517e-02 0.882968488 -0.654285415 0.4305184983
16J 0.543467272 1.992937e-03 -0.769694179 0.583212200 -0.5108630660
16L -1.714113165 3.794170e-01 0.227432067 -0.335961855 0.1314186713
16X -1.622654968 1.713166e-01 0.200780269 0.010452167 -0.1300623128
16Y -2.164718095 7.978942e-03 0.558469942 0.291329602 -0.2839775307
17A -1.180469061 5.280818e-01 -1.198278691 0.495337044 -0.5078286445
17B -0.518191330 -2.330708e-02 -1.887760616 0.953746011 -0.3167097852
17F 0.274475417 4.866159e-01 0.260293836 -0.740756620 0.2753899630
17G -1.055669614 -4.333289e-01 0.286616090 -0.547564952 0.7090156954
17H -1.547953601 4.093002e-01 0.627720359 -0.546435124 0.2382929880
17K -1.550774702 3.063733e-01 0.759985580 -0.925001096 0.5097783613
17L -1.582955879 5.530625e-01 0.666288443 -0.615716064 0.1234530057
17M -1.118511068 -2.352449e-01 0.035033068 -0.361935228 0.5903282113
Dim.12 Dim.13 Dim.14 Dim.15 Dim.16
10A 0.1085083078 0.1672523070 0.3486424017 -0.449362680 -0.038659885
10B 0.3629438935 -0.0023235871 0.2499113792 -0.124767481 -0.052584070
10E -0.3073345989 -0.2423335539 0.2307324710 -0.183325899 -0.272525859
11Y -0.2544707284 -0.5281010223 0.0828313408 0.051742551 -0.393440792
15K -0.2972289148 0.0437322222 0.4301664690 -0.209985371 -0.058811147
15L -0.5707953689 0.6698126081 0.4174604490 -0.083650782 -0.708826004
16A 0.3295214811 -0.2402660029 0.4247855424 -0.154346949 -0.001074633
16B 0.1199254929 -0.4967669202 0.0236699419 0.079140626 0.644575589
16E 2.9484655351 -1.0431595377 -1.9240599248 -0.924242048 1.110643904
16G -0.0691715294 0.3760164819 0.4637970731 -0.450519573 0.235878185
16H -0.1721855700 0.0538656280 0.2529483405 0.124584407 -0.119143975
16J 0.1869807429 0.0242267971 0.6174463618 -0.143221684 0.135550323
16L -0.2379391493 -0.2561391109 0.6667982400 -0.133839204 -0.334947572
16X -0.2636414956 -0.1175255861 1.0433250676 -0.192102821 -0.104018091
16Y 0.0534781633 -0.1237712268 0.6794705460 -0.007505770 0.130256379
17A -0.0540582525 -1.1411471510 0.7224590339 0.224723798 0.543993758
17B 1.1977892244 -0.6789405799 -0.3654368012 -0.068420682 -0.448087392
17F 0.0288690312 -0.2676161572 0.3441671009 -0.067726885 0.549610548
17G 0.8335888212 1.5453371477 -0.5653356017 -0.437291360 -0.475288073
17H -0.7852585489 -0.3336994462 -0.0507602545 -0.049463926 -0.101064126
17K -0.2127585145 -0.2059316181 0.1139681958 -0.039972401 0.329551984
17L -0.5896070173 -0.2243446191 0.2604331894 -0.297960700 0.224934985
17M 0.4871726527 1.2256196756 -0.7206707324 -0.353898032 -0.757894818
Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
10A -0.013989013 -0.049868209 0.061545482 -0.084062650 -0.0983152717
10B 0.008636602 0.042057595 0.022471412 -0.013127038 -0.0021023650
10E -0.119011201 -0.163744553 0.380561610 0.267625834 0.1522980433
11Y -0.196816764 -0.082404663 0.202429902 0.146022130 -0.1590100282
15K 0.257663084 -0.218068072 -0.375916152 0.087211955 -0.2141753651
15L -0.012545901 -0.256813841 0.326222308 0.077214124 0.2502478735
16A -0.215860672 -0.099730270 0.193992005 0.016900928 0.1131454550
16B -0.019170397 -0.354611999 -0.065258351 0.165165811 -0.1289252193
16E -0.484655622 -0.071137668 1.023560381 0.087613501 0.3730182467
16G -0.228927184 0.100673133 0.263636461 -0.238272450 0.0382544747
16H 0.194810282 0.220575660 -0.061929741 -0.008861366 0.1365928158
16J -0.156695072 -0.197701944 0.117131462 -0.089118844 -0.0971720136
16L 0.144383872 -0.062549673 0.163883900 -0.021130362 0.0834282740
16X 0.111294383 -0.164290203 -0.089418314 -0.012024437 -0.4626582575
16Y -0.090510333 0.121393066 -0.274133484 -0.170833748 -0.6993443280
17A 0.145767335 -0.261150332 0.001298097 -0.024317832 0.2764324683
17B -0.031574888 -0.242159763 0.248602436 0.173409802 0.5314320374
17F 0.229085883 -0.111440155 0.325356370 -0.300834524 0.2382675085
17G 0.212139340 0.091445864 0.193084048 -0.034468405 0.1789528124
17H 0.253848088 0.211973463 -0.108651201 -0.195360779 0.1446983544
17K -0.114931613 -0.347985921 0.450107227 0.030509389 0.0902583994
17L -0.024864542 -0.071807530 0.475218271 -0.197583546 0.0317659302
17M 0.382004602 0.526932781 0.195730610 -0.120343106 0.7699906035
Dim.22 Dim.23 Dim.24 Dim.25 Dim.26
10A 0.1910511609 0.039952051 0.048352252 -1.206459e-01 -0.094848404
10B 0.1696005577 -0.011715907 0.071547755 -6.368711e-02 0.008063785
10E -0.0296019210 -0.102295642 0.253815378 -6.867801e-02 -0.403296313
11Y 0.0329405654 -0.124179181 -0.002352048 7.619184e-02 -0.161695012
15K -0.1783619880 -0.174330537 -0.098005803 -3.168179e-01 0.074213788
15L 0.2296205655 0.128039391 -0.117697778 -4.499667e-03 0.001470069
16A -0.0680234014 0.165737266 0.086713588 -3.669788e-02 -0.109061735
16B -0.2423812506 -0.115164790 0.057436994 -3.470089e-02 -0.231868686
16E -0.6714655797 0.539770050 -0.554714241 -2.810791e-01 -0.097693441
16G 0.0138043301 0.034392436 -0.076679541 1.039382e-01 -0.242004327
16H -0.2551255472 0.138782046 -0.356604383 -5.900252e-02 -0.126506567
16J -0.1001269000 0.109004624 0.004533672 2.780563e-03 -0.217265763
16L -0.0301108065 -0.247538363 0.094109030 -7.547833e-03 -0.212679374
16X 0.2360426383 0.129288593 -0.183080133 -1.894825e-01 -0.176325794
16Y 0.1563783462 0.306614528 -0.064463497 -2.730040e-01 -0.118265520
17A 0.0819645977 -0.089288592 -0.176905396 -9.529823e-02 -0.149363370
17B -0.0599566134 0.028929108 -0.163394539 5.243189e-02 0.062402939
17F 0.0258402700 0.061349755 -0.141277093 4.552904e-02 0.050163417
17G 0.1811841545 -0.072525505 0.428024877 1.569601e-01 -0.167857070
17H -0.2236006781 0.215991166 -0.087942770 -2.656066e-01 0.348434556
17K -0.1022069446 -0.458441004 0.055546960 2.712463e-01 -0.099490102
17L -0.2463476148 -0.143590115 0.122541269 -6.159793e-02 0.003143265
17M 0.2618337869 0.062126217 -0.117203758 2.158792e-01 0.094389097
Dim.27 Dim.28 Dim.29 Dim.30 Dim.31
10A -1.013436e-02 0.024539295 -0.0358041539 -3.056426e-03 -7.652263e-02
10B -8.881169e-02 0.038354010 -0.0156378260 2.555456e-03 -5.736526e-03
10E 1.057239e-01 0.310522062 -0.0414056828 9.981591e-02 -9.967208e-02
11Y 3.977547e-01 0.051099506 0.1761480887 -1.657971e-01 2.113169e-01
15K -2.375772e-01 -0.077862039 -0.0200442560 9.564503e-02 -8.620410e-02
15L -4.598725e-03 0.074410940 0.0308073194 -2.003688e-02 -3.463860e-02
16A -8.113217e-02 0.106145880 0.0144738901 -5.924624e-02 7.418179e-02
16B -1.012332e-01 0.101719661 -0.1091143838 6.940196e-02 8.747023e-02
16E 2.919003e-01 0.016987499 0.2868972065 -2.221013e-01 9.848382e-02
16G 4.018050e-02 0.047957565 0.0112625275 -5.254871e-02 6.604319e-02
16H -2.151748e-02 -0.016017119 0.1096180105 1.519586e-01 9.296455e-02
16J 1.973718e-02 0.015866061 0.0704226861 -1.197704e-01 8.264083e-02
16L 1.494477e-01 -0.006433867 -0.0517057099 -8.643427e-02 -8.569243e-02
16X -1.970587e-01 0.050607542 -0.1164434843 4.770605e-03 3.142338e-01
16Y -1.183576e-01 -0.065444610 -0.1789311048 -2.441205e-01 -3.970071e-03
17A 3.606205e-02 0.190320800 0.1285290335 -1.188451e-01 9.442775e-04
17B -5.569296e-02 0.344136544 -0.1368148252 -1.683090e-02 2.656214e-01
17F -1.097538e-01 -0.087012718 -0.3890297423 7.298743e-02 6.086590e-02
17G 1.652829e-01 0.132142245 -0.0920719158 -1.003628e-01 2.341681e-02
17H -1.039334e-01 0.165672616 -0.2705183587 2.295024e-02 5.062636e-02
17K 3.653419e-01 0.211204721 0.0236354695 -1.348201e-01 2.221987e-02
17L 3.238312e-02 0.080238408 -0.1724782633 -2.430069e-01 2.727123e-02
17M 1.849442e-01 0.159513046 -0.0745783075 1.745692e-01 -6.270878e-02
Dim.32 Dim.33 Dim.34 Dim.35 Dim.36
10A -0.1223256205 -0.020059746 0.1194390888 0.0543429302 2.867830e-02
10B -0.0015439141 -0.068836548 0.0799320925 0.0313239954 -4.413682e-02
10E -0.1342565896 -0.029934617 -0.0035791000 -0.0646967657 9.698481e-03
11Y 0.1080767888 -0.001744475 -0.0339235106 0.0868923889 1.159281e-01
15K -0.1230649365 0.044419428 0.0279388866 -0.0810218554 -3.067368e-02
15L -0.0598993641 0.037654778 0.0924660485 0.0000845747 -3.596857e-02
16A -0.0265748607 -0.063275076 -0.0331315430 -0.0664700874 -1.763365e-02
16B -0.0322444449 0.041229080 -0.0026382302 -0.0197615717 -4.058804e-03
16E 0.0603459075 -0.049094642 0.0341469870 0.0098914580 2.817218e-02
16G 0.0891736174 0.023679770 -0.0539855128 -0.1553602715 1.076101e-01
16H -0.1256860510 0.084041500 -0.0316745683 -0.0052972243 -1.756423e-02
16J -0.0872764048 0.002034392 -0.0543255669 -0.0567901160 2.412947e-02
16L -0.0116558957 -0.037001103 -0.0339680784 -0.0757795677 -1.185639e-02
16X 0.3287694141 -0.121772793 0.0353983372 0.0412649544 8.775987e-02
16Y -0.0107983969 0.002788849 -0.0652782973 0.0101150727 1.308197e-02
17A -0.0237782035 -0.003399762 0.0155663206 0.0050761569 -5.187226e-02
17B 0.3446517437 0.174104807 0.0614652837 0.1162741038 7.172430e-02
17F -0.0005331575 0.207543459 0.1106501889 -0.0768172155 -4.107588e-02
17G 0.0032972205 0.140240210 0.0159059548 -0.0330534142 -4.040581e-02
17H 0.1181950178 -0.248731411 -0.0258424604 -0.0082373793 9.214950e-04
17K -0.0539215245 0.082287207 -0.0168460649 0.0928107986 8.299477e-02
17L 0.2391192750 -0.162302081 -0.0113720083 0.0429648257 -4.523328e-02
17M -0.0495782125 0.118884975 -0.0052960587 0.0055044837 6.907606e-03
Dim.37 Dim.38 Dim.39 Dim.40 Dim.41
10A 0.0416880305 -5.623081e-03 -0.0269754746 -1.311137e-02 -1.361863e-02
10B 0.0282017940 1.210562e-02 -0.0060365120 -1.065691e-02 1.074055e-02
10E 0.0064114242 1.149354e-02 -0.0049071460 -2.104260e-03 -8.509691e-03
11Y 0.0702982041 -8.662284e-02 -0.0616978887 -1.984622e-02 -1.530568e-02
15K -0.0633117886 1.266739e-02 -0.0154061888 1.091886e-02 9.814985e-04
15L -0.0522812607 -1.349988e-02 0.0223477372 1.718584e-02 -9.140548e-04
16A 0.0135941857 5.562863e-06 -0.0063763248 -5.397158e-03 -1.609453e-02
16B -0.0073879151 3.232388e-02 0.0501574816 -4.955605e-03 3.866150e-03
16E 0.0253005890 -1.283040e-01 0.1012134379 -5.157183e-03 2.818900e-02
16G -0.0222446528 -3.745610e-02 0.0387190467 -1.928684e-02 -2.730989e-02
16H -0.0208155349 -3.671872e-02 0.0187277006 1.214933e-02 1.027403e-02
16J -0.0125374151 -2.574236e-02 0.0137887303 -1.240361e-02 2.514162e-03
16L -0.0043211000 -3.282431e-03 0.0304821163 6.666663e-03 1.076047e-03
16X 0.0797448730 5.515680e-02 -0.0812024851 -8.595630e-03 5.034341e-03
16Y -0.0082263049 -2.080322e-02 0.0317844562 -1.595780e-02 -4.773846e-03
17A -0.0554788580 -4.194835e-02 -0.0038693634 1.460534e-02 1.054139e-02
17B -0.0753540463 7.905749e-02 -0.0946624982 6.917386e-04 8.770804e-03
17F -0.0267404768 7.699470e-02 0.0016387862 -1.820436e-02 -1.801550e-02
17G -0.0025781825 2.631362e-02 0.0042653944 -1.707652e-02 6.987287e-03
17H -0.0682425779 3.419874e-02 -0.0248683173 6.631064e-03 1.433760e-02
17K 0.0129758868 6.158344e-02 0.0167577060 -1.724061e-02 3.464283e-03
17L -0.0245398153 2.305695e-02 -0.0073531128 4.755591e-03 -1.891973e-03
17M -0.0057349445 9.716078e-04 0.0139556034 -6.196076e-04 3.880775e-04
Dim.42
10A -1.623002e-16
10B 4.678794e-16
10E -4.156912e-16
11Y -3.659179e-16
15K -2.211667e-15
15L -1.000575e-15
16A -9.426823e-16
16B -3.699428e-16
16E -1.493855e-15
16G -1.386722e-15
16H 2.746736e-16
16J -1.378688e-15
16L -1.351729e-15
16X -1.361262e-15
16Y -1.755417e-15
17A -3.305707e-16
17B 6.249625e-16
17F -6.862838e-16
17G 2.433032e-15
17H -8.310768e-16
17K 1.210537e-15
17L -5.372654e-16
17M 3.241721e-15
[ reached getOption("max.print") -- omitted 245 rows ]
# Route map: every route (PCA individual) is plotted and coloured by its cos2
# value, on a white -> blue -> red scale centred at 0.50.
# The axis limits restrict the view to [-5, 5] on both axes; comment out
# xlim() and ylim() to see the EXTREME outlier routes.
fviz_pca_ind(PcaRes, col.ind = "cos2") +
  scale_color_gradient2(
    low = "white",
    mid = "blue",
    high = "red",
    midpoint = 0.50
  ) +
  xlim(-5, 5) +
  ylim(-5, 5)
# Graph of Route Map (top 10 routes by cos2)
# Individuals (routes) are filtered with `select.ind`; `select.var` only
# applies to variable plots and does not restrict the routes drawn here.
fviz_pca_ind(PcaRes,
             col.ind = "cos2",
             select.ind = list(cos2 = 10)
             ) +
  scale_color_gradient2(low = "white",
                        mid = "blue",
                        high = "red",
                        midpoint = 0.50
                        ) +
  # comment out xlim and ylim to see EXTREME outlier Routes
  xlim(-5, 5) +
  ylim(-5, 5)
# Open the rows of WaitTime_RteCnts for route "SH99" in the data viewer;
# this route looks like an EXTREME outlier on the route map.
View(filter(WaitTime_RteCnts, Route == "SH99"))
# Biplot of Routes and Variables
fviz_pca_biplot(PcaRes, geom = "text") +
xlim(-5, 5) +
ylim(-5, 5)
# 9 eigenvalues give ~ 90% of the variance
# "elbow" at ~6th Principal Component
# ~ 8 eigenvalues > 1 (PC accounts for more variance than accounted the original standardized variables)
View(get_eigenvalue(PcaRes))
fviz_screeplot(PcaRes, ncp = 15)
fviz_screeplot(PcaRes, ncp = 15, choice = "eigenvalue")
# Keep the coordinates on the first 8 dimensions ("top" 8 PCs) as a data frame
RouteStats_Pca_8Eign <- PcaRes_Rtes$coord %>%
  as.data.frame() %>%
  select(Dim.1, Dim.2, Dim.3, Dim.4, Dim.5, Dim.6, Dim.7, Dim.8)

View(RouteStats_Pca_8Eign)
Clustering (using the Principal Components computed using caret::preProcess).
Are the data clusterable?
##### Cluster tendency of the PCA-transformed route data
# gradient_col <- list(low = "steelblue", high = "white")
# Sample size n is one less than the number of rows; fixed seed for repeatability.
ClustData_Ends <- get_clust_tendency(
  RouteStats_Pca,
  n = nrow(RouteStats_Pca) - 1,
  # gradient = gradient_col,
  seed = 123456789
)

str(ClustData_Ends)
List of 2
$ hopkins_stat: num 0.166
$ plot :List of 9
..$ data :'data.frame': 71824 obs. of 3 variables:
.. ..$ Var1 : Factor w/ 268 levels "r202-","r210-",..: 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ Var2 : Factor w/ 268 levels "r202-","r210-",..: 1 1 1 1 1 1 1 1 1 1 ...
.. ..$ value: num [1:71824] 0 13.7 11.4 10.6 12.9 ...
..$ layers :List of 1
.. ..$ :Classes 'LayerInstance', 'Layer', 'ggproto' <ggproto object: Class LayerInstance, Layer>
aes_params: list
compute_aesthetics: function
compute_geom_1: function
compute_geom_2: function
compute_position: function
compute_statistic: function
data: waiver
draw_geom: function
finish_statistics: function
geom: <ggproto object: Class GeomTile, GeomRect, Geom>
aesthetics: function
default_aes: uneval
draw_group: function
draw_key: function
draw_layer: function
draw_panel: function
extra_params: na.rm width height
handle_na: function
non_missing_aes:
optional_aes:
parameters: function
required_aes: x y
setup_data: function
use_defaults: function
super: <ggproto object: Class GeomRect, Geom>
geom_params: list
inherit.aes: TRUE
layer_data: function
map_statistic: function
mapping: uneval
position: <ggproto object: Class PositionIdentity, Position>
compute_layer: function
compute_panel: function
required_aes:
setup_data: function
setup_params: function
super: <ggproto object: Class Position>
print: function
show.legend: NA
stat: <ggproto object: Class StatIdentity, Stat>
aesthetics: function
compute_group: function
compute_layer: function
compute_panel: function
default_aes: uneval
extra_params: na.rm
finish_layer: function
non_missing_aes:
parameters: function
required_aes:
retransform: TRUE
setup_data: function
setup_params: function
super: <ggproto object: Class Stat>
stat_params: list
subset: NULL
super: <ggproto object: Class Layer>
..$ scales :Classes 'ScalesList', 'ggproto' <ggproto object: Class ScalesList>
add: function
clone: function
find: function
get_scales: function
has_scale: function
input: function
n: function
non_position_scales: function
scales: list
super: <ggproto object: Class ScalesList>
..$ mapping :List of 2
.. ..$ x: symbol Var1
.. ..$ y: symbol Var2
..$ theme :List of 4
.. ..$ axis.title.x: list()
.. .. ..- attr(*, "class")= chr [1:2] "element_blank" "element"
.. ..$ axis.title.y: list()
.. .. ..- attr(*, "class")= chr [1:2] "element_blank" "element"
.. ..$ axis.text : list()
.. .. ..- attr(*, "class")= chr [1:2] "element_blank" "element"
.. ..$ axis.ticks : list()
.. .. ..- attr(*, "class")= chr [1:2] "element_blank" "element"
.. ..- attr(*, "class")= chr [1:2] "theme" "gg"
.. ..- attr(*, "complete")= logi FALSE
.. ..- attr(*, "validate")= logi FALSE
..$ coordinates:Classes 'CoordCartesian', 'Coord', 'ggproto' <ggproto object: Class CoordCartesian, Coord>
aspect: function
distance: function
expand: TRUE
is_linear: function
labels: function
limits: list
range: function
render_axis_h: function
render_axis_v: function
render_bg: function
render_fg: function
train: function
transform: function
super: <ggproto object: Class CoordCartesian, Coord>
..$ facet :Classes 'FacetNull', 'Facet', 'ggproto' <ggproto object: Class FacetNull, Facet>
compute_layout: function
draw_back: function
draw_front: function
draw_labels: function
draw_panels: function
finish_data: function
init_scales: function
map: function
map_data: function
params: list
render_back: function
render_front: function
render_panels: function
setup_data: function
setup_params: function
shrink: TRUE
train: function
train_positions: function
train_scales: function
vars: function
super: <ggproto object: Class FacetNull, Facet>
..$ plot_env :<environment: 0x11fc27578>
..$ labels :List of 3
.. ..$ x : chr "Var1"
.. ..$ y : chr "Var2"
.. ..$ fill: chr "value"
..- attr(*, "class")= chr [1:2] "gg" "ggplot"
# Hopkins statistic. The author reads the value of 0.1657494 as implying that
# the data are not uniformly distributed (they are "clusterable").
# NOTE(review): which end of the scale means "clusterable" may depend on the
# installed factoextra version -- confirm before reusing this interpretation.
ClustData_Ends$hopkins_stat

# Plot returned alongside the statistic
ClustData_Ends$plot
Clustering. How many clusters are there?
kmeans, pam, and hierarchical clustering methods, using within sum of squares and silhouette measures.
# class(RouteStats_Pca)

# Within-cluster sum of squares ("elbow") for each algorithm
fviz_nbclust(RouteStats_Pca, kmeans, method = "wss") # ~8 clusters
fviz_nbclust(RouteStats_Pca, pam, method = "wss") # ~6 clusters
fviz_nbclust(RouteStats_Pca, hcut, method = "wss") # ~6 clusters

# Average silhouette width for each algorithm
fviz_nbclust(RouteStats_Pca, kmeans, method = "silhouette") # 2 clusters
fviz_nbclust(RouteStats_Pca, pam, method = "silhouette") # 2 clusters
fviz_nbclust(
  RouteStats_Pca,
  hcut,
  method = "silhouette",
  hc_method = "complete"
) # 2 clusters
Clustering. How many clusters are there?
kmeans method with the gap statistic, using bootstrap.
# Compute gap statistic
# kmeans version: k = 1..10, 500 bootstrap reference sets, 25 random starts
# per kmeans fit. The clustering function is passed under its full argument
# name (FUNcluster) instead of relying on partial matching of `FUN`.
set.seed(123456789)
# system.time(
gap_stat_km <- clusGap(RouteStats_Pca,
  FUNcluster = kmeans,
  nstart = 25,
  K.max = 10,
  B = 500
)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 500) [one "." per sample]:
.................................................. 50
.................................................. 100
.................................................. 150
.................................................. 200
.................................................. 250
.................................................. 300
.................................................. 350
.................................................. 400
.................................................. 450
.................................................. 500
# )
# Print the kmeans gap table, picking the number of clusters with the "Tibs2001SEmax" rule
print(gap_stat_km, method = "Tibs2001SEmax")
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = RouteStats_Pca, FUNcluster = kmeans, K.max = 10, B = 500, nstart = 25)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 1
logW E.logW gap SE.sim
[1,] 6.273354 7.317537 1.044183 0.011563071
[2,] 6.141046 7.189388 1.048342 0.010769264
[3,] 6.104208 7.105363 1.001155 0.009460871
[4,] 6.027683 7.047106 1.019423 0.009071434
[5,] 5.969429 7.012677 1.043248 0.008658388
[6,] 5.924133 6.983507 1.059374 0.008524205
[7,] 5.890708 6.958233 1.067525 0.008415261
[8,] 5.863342 6.936076 1.072734 0.008331378
[9,] 5.839159 6.916587 1.077427 0.008341695
[10,] 5.810654 6.899180 1.088526 0.008325652
# Print the same table again with print()'s default selection rule, for comparison
print(gap_stat_km)
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = RouteStats_Pca, FUNcluster = kmeans, K.max = 10, B = 500, nstart = 25)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 1
logW E.logW gap SE.sim
[1,] 6.273354 7.317537 1.044183 0.011563071
[2,] 6.141046 7.189388 1.048342 0.010769264
[3,] 6.104208 7.105363 1.001155 0.009460871
[4,] 6.027683 7.047106 1.019423 0.009071434
[5,] 5.969429 7.012677 1.043248 0.008658388
[6,] 5.924133 6.983507 1.059374 0.008524205
[7,] 5.890708 6.958233 1.067525 0.008415261
[8,] 5.863342 6.936076 1.072734 0.008331378
[9,] 5.839159 6.916587 1.077427 0.008341695
[10,] 5.810654 6.899180 1.088526 0.008325652
# pam version: k = 1..10, 500 bootstrap reference sets.
# The clustering function is passed under its full argument name (FUNcluster)
# instead of relying on partial matching of `FUN`.
set.seed(123456789)
gap_stat_pm <- clusGap(RouteStats_Pca,
  FUNcluster = pam,
  K.max = 10,
  B = 500
)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 500) [one "." per sample]:
.................................................. 50
.................................................. 100
.................................................. 150
.................................................. 200
.................................................. 250
.................................................. 300
.................................................. 350
.................................................. 400
.................................................. 450
.................................................. 500
# Print the pam gap table, picking the number of clusters with the "Tibs2001SEmax" rule
print(gap_stat_pm, method = "Tibs2001SEmax")
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = RouteStats_Pca, FUNcluster = pam, K.max = 10, B = 500)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 2
logW E.logW gap SE.sim
[1,] 6.273354 7.317222 1.043868 0.01161238
[2,] 6.140024 7.208149 1.068125 0.01495862
[3,] 6.087381 7.126041 1.038659 0.01414766
[4,] 6.017737 7.076534 1.058796 0.01469932
[5,] 5.991660 7.043313 1.051652 0.01203089
[6,] 5.943989 7.016020 1.072030 0.01149023
[7,] 5.922939 6.992303 1.069363 0.01153694
[8,] 5.888683 6.971776 1.083093 0.01124332
[9,] 5.863290 6.953980 1.090690 0.01107343
[10,] 5.843109 6.937086 1.093977 0.01050172
# Print the same table again with print()'s default selection rule, for comparison
print(gap_stat_pm)
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = RouteStats_Pca, FUNcluster = pam, K.max = 10, B = 500)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 2
logW E.logW gap SE.sim
[1,] 6.273354 7.317222 1.043868 0.01161238
[2,] 6.140024 7.208149 1.068125 0.01495862
[3,] 6.087381 7.126041 1.038659 0.01414766
[4,] 6.017737 7.076534 1.058796 0.01469932
[5,] 5.991660 7.043313 1.051652 0.01203089
[6,] 5.943989 7.016020 1.072030 0.01149023
[7,] 5.922939 6.992303 1.069363 0.01153694
[8,] 5.888683 6.971776 1.083093 0.01124332
[9,] 5.863290 6.953980 1.090690 0.01107343
[10,] 5.843109 6.937086 1.093977 0.01050172
# hierarchical version: k = 1..10, 500 bootstrap reference sets.
# The clustering function is passed under its full argument name (FUNcluster)
# instead of relying on partial matching of `FUN`.
set.seed(123456789)
gap_stat_hcut <- clusGap(RouteStats_Pca,
  FUNcluster = hcut,
  K.max = 10,
  B = 500
)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 500) [one "." per sample]:
.................................................. 50
.................................................. 100
.................................................. 150
.................................................. 200
.................................................. 250
.................................................. 300
.................................................. 350
.................................................. 400
.................................................. 450
.................................................. 500
# Print the hierarchical gap table, picking the number of clusters with the "Tibs2001SEmax" rule
print(gap_stat_hcut, method = "Tibs2001SEmax")
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = RouteStats_Pca, FUNcluster = hcut, K.max = 10, B = 500)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 1
logW E.logW gap SE.sim
[1,] 6.273354 7.317222 1.0438683 0.011612377
[2,] 6.171268 7.204939 1.0336710 0.012770759
[3,] 6.138916 7.130456 0.9915393 0.011444429
[4,] 6.045351 7.075400 1.0300493 0.012108951
[5,] 5.985764 7.041724 1.0559601 0.011094868
[6,] 5.943671 7.013110 1.0694384 0.010703021
[7,] 5.918347 6.988042 1.0696958 0.010276876
[8,] 5.883665 6.965752 1.0820861 0.010056348
[9,] 5.856593 6.945803 1.0892106 0.009895617
[10,] 5.833306 6.927759 1.0944528 0.009736810
# Print the same table again with print()'s default selection rule, for comparison
print(gap_stat_hcut)
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = RouteStats_Pca, FUNcluster = hcut, K.max = 10, B = 500)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 1
logW E.logW gap SE.sim
[1,] 6.273354 7.317222 1.0438683 0.011612377
[2,] 6.171268 7.204939 1.0336710 0.012770759
[3,] 6.138916 7.130456 0.9915393 0.011444429
[4,] 6.045351 7.075400 1.0300493 0.012108951
[5,] 5.985764 7.041724 1.0559601 0.011094868
[6,] 5.943671 7.013110 1.0694384 0.010703021
[7,] 5.918347 6.988042 1.0696958 0.010276876
[8,] 5.883665 6.965752 1.0820861 0.010056348
[9,] 5.856593 6.945803 1.0892106 0.009895617
[10,] 5.833306 6.927759 1.0944528 0.009736810
# Gap-statistic curves, each using the "Tibs2001SEmax" rule to mark the chosen k

# kmeans
fviz_gap_stat(gap_stat_km, maxSE = list(method = "Tibs2001SEmax")) # 1 cluster

# pam
fviz_gap_stat(gap_stat_pm, maxSE = list(method = "Tibs2001SEmax")) # 2 cluster

# hierarchical
fviz_gap_stat(gap_stat_hcut, maxSE = list(method = "Tibs2001SEmax")) # 1 cluster
Clustering. How many clusters are there?
kmeans method with various different statistics.
# str(iris)
# Run every NbClust index for kmeans partitions of 2 to 15 clusters
# (Euclidean distance) on the PCA-transformed route data.
nb <- NbClust(
  RouteStats_Pca, # scale(iris[ ,-5]),
  distance = "euclidean",
  min.nc = 2,
  max.nc = 15,
  method = "kmeans",
  index = "all"
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 7 proposed 2 as the best number of clusters
* 2 proposed 3 as the best number of clusters
* 1 proposed 5 as the best number of clusters
* 1 proposed 6 as the best number of clusters
* 1 proposed 7 as the best number of clusters
* 6 proposed 8 as the best number of clusters
* 2 proposed 9 as the best number of clusters
* 1 proposed 13 as the best number of clusters
* 2 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
# Summarise the NbClust results: how many indices proposed each number of clusters
fviz_nbclust(nb)
Among all indices:
===================
* 2 proposed 0 as the best number of clusters
* 1 proposed 1 as the best number of clusters
* 7 proposed 2 as the best number of clusters
* 2 proposed 3 as the best number of clusters
* 1 proposed 5 as the best number of clusters
* 1 proposed 6 as the best number of clusters
* 1 proposed 7 as the best number of clusters
* 6 proposed 8 as the best number of clusters
* 2 proposed 9 as the best number of clusters
* 1 proposed 13 as the best number of clusters
* 2 proposed 15 as the best number of clusters
Conclusion
=========================
* According to the majority rule, the best number of clusters is 2 .
Clustering. How many clusters are there?
Hierarchical clustering method. Particularly looking at silhouette statistics.
# Hierarchical clustering, cut in 2 to 15 groups.
# For every k this stores, under names suffixed with "_K<k>":
#   HCRes_K<k>      the eclust() result
#   HCStats_K<k>    fpc::cluster.stats() validation statistics
#   HCDend_K<k>     dendrogram plot
#   HCSil_K<k>      silhouette plot
#   HCSilWidth_K<k> per-cluster average silhouette widths plus a KVal index
for (i in 2:15) {
  # eclust() takes the linkage through `hc_method`. A `method` argument is not
  # forwarded to hclust, so the default "ward.D2" linkage would be used.
  # NOTE: outputs recorded with the default linkage will differ from a re-run.
  x <- eclust(RouteStats_Pca,
    "hclust",
    k = i,
    hc_method = "complete",
    graph = FALSE
  )
  assign(paste0("HCRes_K", i), x)

  # NOTE(review): distances are computed on RouteStats_Scaled while the
  # clustering itself uses RouteStats_Pca -- confirm this is intended.
  y <- cluster.stats(
    dist(RouteStats_Scaled, method = "euclidean"),
    x$cluster
  )
  assign(paste0("HCStats_K", i), y)

  assign(paste0("HCDend_K", i), fviz_dend(x, rect = TRUE, show_labels = FALSE))
  assign(paste0("HCSil_K", i), fviz_silhouette(x))

  # The value column keeps its default name `y$clus.avg.silwidths`, which the
  # HCSilWidth_AllK join selects by name. seq_len() avoids the 1:0 trap.
  assign(
    paste0("HCSilWidth_K", i),
    as.data.frame(y$clus.avg.silwidths) %>%
      mutate(KVal = seq_len(nrow(.)))
  )
}
replacing previous import by ‘magrittr::%>%’ when loading ‘dendextend’
cluster size ave.sil.width
1 1 234 0.44
2 2 34 0.10
cluster size ave.sil.width
1 1 234 0.40
2 2 33 0.18
3 3 1 0.00
cluster size ave.sil.width
1 1 81 0.32
2 2 153 0.08
3 3 33 0.11
4 4 1 0.00
cluster size ave.sil.width
1 1 81 0.26
2 2 91 0.14
3 3 62 0.09
4 4 33 0.06
5 5 1 0.00
cluster size ave.sil.width
1 1 81 0.16
2 2 91 0.08
3 3 10 0.10
4 4 52 0.25
5 5 33 0.06
6 6 1 0.00
cluster size ave.sil.width
1 1 81 0.16
2 2 91 0.07
3 3 10 0.10
4 4 52 0.25
5 5 29 0.13
6 6 4 0.20
7 7 1 0.00
cluster size ave.sil.width
1 1 81 0.15
2 2 61 0.06
3 3 10 0.08
4 4 52 0.22
5 5 30 0.16
6 6 29 0.05
7 7 4 0.18
8 8 1 0.00
cluster size ave.sil.width
1 1 77 0.21
2 2 61 0.06
3 3 10 0.07
4 4 52 0.19
5 5 30 0.16
6 6 29 0.05
7 7 4 0.15
8 8 4 0.18
9 9 1 0.00
cluster size ave.sil.width
1 1 77 0.21
2 2 61 0.06
3 3 5 0.40
4 4 52 0.19
5 5 30 0.16
6 6 29 0.05
7 7 4 0.14
8 8 5 0.33
9 9 4 0.18
10 10 1 0.00
cluster size ave.sil.width
1 1 77 0.21
2 2 61 0.06
3 3 5 0.40
4 4 52 0.19
5 5 30 0.16
6 6 28 0.08
7 7 4 0.14
8 8 5 0.33
9 9 4 0.18
10 10 1 0.00
11 11 1 0.00
cluster size ave.sil.width
1 1 77 0.21
2 2 61 0.06
3 3 5 0.40
4 4 52 0.19
5 5 30 0.15
6 6 14 0.06
7 7 4 0.14
8 8 14 0.25
9 9 5 0.33
10 10 4 0.18
11 11 1 0.00
12 12 1 0.00
cluster size ave.sil.width
1 1 77 0.20
2 2 34 0.08
3 3 27 0.07
4 4 5 0.40
5 5 52 0.19
6 6 30 0.13
7 7 14 0.06
8 8 4 0.14
9 9 14 0.25
10 10 5 0.33
11 11 4 0.17
12 12 1 0.00
13 13 1 0.00
cluster size ave.sil.width
1 1 30 0.30
2 2 34 0.08
3 3 27 0.07
4 4 5 0.40
5 5 52 0.04
6 6 30 0.13
7 7 14 0.06
8 8 4 0.11
9 9 47 0.01
10 10 14 0.25
11 11 5 0.33
12 12 4 0.17
13 13 1 0.00
14 14 1 0.00
cluster size ave.sil.width
1 1 30 0.30
2 2 34 0.07
3 3 27 0.07
4 4 5 0.40
5 5 52 0.04
6 6 30 0.12
7 7 9 0.18
8 8 4 0.11
9 9 47 0.01
10 10 14 0.19
11 11 5 0.17
12 12 5 0.33
13 13 4 0.13
14 14 1 0.00
15 15 1 0.00
# One wide table of per-cluster average silhouette widths: one row per cluster
# index (KVal), one column per solution from k = 15 down to k = 2.
# Start from the k = 15 table (KVal first) and left-join the k = 14 ... k = 2
# tables onto it, one after another, on KVal.
HCSilWidth_AllK <- Reduce(
  function(Joined, NextK) left_join(Joined, NextK, by = c("KVal" = "KVal")),
  mget(paste0("HCSilWidth_K", 14:2), inherits = TRUE),
  select(HCSilWidth_K15, KVal, `y$clus.avg.silwidths`)
)

# Replace the auto-generated join column names with the k they belong to
colnames(HCSilWidth_AllK) <- c("KVal", paste0("K", 15:2))
# Visualize: dendrograms for k = 2..15, then silhouette plots for k = 2..15,
# then the combined silhouette-width table.
invisible(lapply(
  paste0("HCDend_K", 2:15),
  function(PlotName) print(get(PlotName))
))
invisible(lapply(
  paste0("HCSil_K", 2:15),
  function(PlotName) print(get(PlotName))
))
HCSilWidth_AllK
With Hierarchical Clustering and k=2, these are the routes in each cluster.
# Two-cluster hierarchical solution.
# eclust() takes the linkage through `hc_method`. A `method` argument is not
# forwarded to hclust, so the default "ward.D2" linkage would be used.
# NOTE: outputs recorded with the default linkage will differ from a re-run.
HC_K2 <- eclust(RouteStats_Pca,
  "hclust",
  k = 2,
  hc_method = "complete",
  graph = FALSE
)
str(HC_K2)
List of 12
$ merge : int [1:267, 1:2] -92 -56 -165 -73 -117 -108 -134 -217 -102 -168 ...
$ height : num [1:267] 0.882 1.01 1.09 1.095 1.188 ...
$ order : int [1:268] 202 210 39 160 81 92 103 117 221 177 ...
$ labels : chr [1:268] "10A" "10B" "10E" "11Y" ...
$ method : chr "ward.D2"
$ call : language stats::hclust(d = x, method = hc_method)
$ dist.method: chr "euclidean"
$ cluster : Named int [1:268] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr [1:268] "10A" "10B" "10E" "11Y" ...
$ nbclust : num 2
$ silinfo :List of 3
..$ widths :'data.frame': 268 obs. of 3 variables:
.. ..$ cluster : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
.. ..$ neighbor : num [1:268] 2 2 2 2 2 2 2 2 2 2 ...
.. ..$ sil_width: num [1:268] 0.588 0.585 0.582 0.581 0.578 ...
..$ clus.avg.widths: num [1:2] 0.439 0.102
..$ avg.width : num 0.397
$ size : int [1:2] 234 34
$ data :'data.frame': 268 obs. of 15 variables:
..$ PC1 : num [1:268] -2.143 -3.5 0.996 2.215 -1.33 ...
..$ PC2 : num [1:268] -0.34 -0.708 2.136 2.78 2.115 ...
..$ PC3 : num [1:268] -0.2398 -0.0136 -0.1131 1.8714 0.3826 ...
..$ PC4 : num [1:268] -0.441 -0.741 -0.231 -1.692 0.286 ...
..$ PC5 : num [1:268] 0.455 0.137 -1.133 -0.39 -0.422 ...
..$ PC6 : num [1:268] -0.542 -0.651 0.384 0.335 0.856 ...
..$ PC7 : num [1:268] -0.248 -0.457 -1.26 -1.574 0.945 ...
..$ PC8 : num [1:268] 0.435 0.373 1.788 0.854 0.745 ...
..$ PC9 : num [1:268] -0.334 -0.845 0.458 -0.891 -0.619 ...
..$ PC10: num [1:268] 0.207 0.69 -1.068 -0.403 -0.095 ...
..$ PC11: num [1:268] -0.72 -0.321 0.904 0.47 0.458 ...
..$ PC12: num [1:268] -0.347 -0.6553 0.3871 -0.0855 0.0615 ...
..$ PC13: num [1:268] 0.1573 -0.0503 0.0605 0.4991 -0.2338 ...
..$ PC14: num [1:268] -0.0734 -0.1431 0.1695 0.0626 0.5568 ...
..$ PC15: num [1:268] -0.1296 -0.1389 -0.1039 -0.3495 -0.0226 ...
- attr(*, "class")= chr [1:3] "hclust" "hcut" "eclust"
# Route-to-cluster lookup for the k = 2 solution, sorted by cluster then route.
# HC_K2$cluster is a named integer vector (names are the route labels). The
# route names are taken straight from names() so the result does not depend on
# row names surviving a dplyr verb.
HC_K2_Clusters <- data.frame(
  ClusterNum = unname(HC_K2$cluster),
  BusRoute = names(HC_K2$cluster),
  stringsAsFactors = FALSE
) %>%
  arrange(ClusterNum, BusRoute)
HC_K2_Clusters

# Number of routes in each cluster
HC_K2_Clusters %>%
  group_by(ClusterNum) %>%
  summarise(Cnt = n())
Using kmeans, PAM, and Hierarchical clustering methods, we can say we probably have 2 clusters.
Let’s try density clustering. (This tends to show that maybe there is only one “cluster,” meaning that data are not clusterable.)
# Drop every object whose name contains "_K" (the per-k clustering results)
rm(list = ls(pattern = "_K"))

# Compute DBSCAN using fpc package
# k-nearest-neighbour distance plot (k = 10) with a reference line at the
# eps value used below.
kNNdistplot(RouteStats_Pca, k = 10)
abline(h = 8.5, lty = 2)

set.seed(123456789)
db <- fpc::dbscan(RouteStats_Pca, eps = 8.5, MinPts = 10)
str(db)
List of 4
$ cluster: num [1:268] 1 1 1 1 1 1 1 1 1 1 ...
$ eps : num 8.5
$ MinPts : num 10
$ isseed : logi [1:268] TRUE TRUE TRUE TRUE TRUE TRUE ...
- attr(*, "class")= chr "dbscan"
# Print the DBSCAN summary (border/seed point counts per cluster; column 0 is the noise group)
db
dbscan Pts=268 MinPts=10 eps=8.5
0 1
border 5 7
seed 0 256
total 5 263
# Plot DBSCAN results as points, without standardising the data and without
# cluster outlines. `ellipse` replaces the deprecated `frame` argument, which
# triggers "argument frame is deprecated; please use ellipse instead."
fviz_cluster(db,
  RouteStats_Pca,
  stand = FALSE,
  ellipse = FALSE,
  geom = "point"
)
argument frame is deprecated; please use ellipse instead.
We can say that MAYBE there are two clusters, but there is more evidence for probably just one cluster (i.e., the data are NOT clusterable).
# remove no longer needed items
# Individually named objects first ...
rm(list = c(
  "X2_Long", "X2_Pct", "ClustData_Ends", "db", "gap_stat", "gap_stat_hcut",
  "gap_stat_km", "gap_stat_pm", "i", "nb", "rd", "Trnsfrm", "x", "y",
  "BusRoute", "Rte", "map", "WaitTime_AllBus_Zip_Box",
  "WaitTime_AllBus_Zip_Violin", "X2_WaitByHr_Line"
))
# ... then every object whose name contains one of these fragments
rm(list = ls(pattern = "Count|RoutStop_|TimeBtw|PcaRes"))
Investigating TravelTime_Sec.
# The 500 largest TravelTime_Sec values among rows with no route change
View(
  TTLargeRteChng %>%
    filter(!is.na(TravelTime_Sec) & RteChange2 == "Same") %>%
    arrange(desc(TravelTime_Sec), SpeedAvg_Mph_NewHvrs) %>%
    head(500)
)

# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
# Each window is the example row (named in the trailing comment) +/- 10 rows.
View(
  NewTravTime %>%
    select(
      # -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
      -(TD_Mi_q2:TD_Mi_SSHG_Cnt_F),
      -(TT_Hr_q2:TT_Hr_SSHG_Cnt_F)
    ) %>%
    filter(
      between(RowNum_OG, 2217353, 2217373) | # 2217363
        between(RowNum_OG, 3090321, 3090341) | # 3090331
        between(RowNum_OG, 80764, 80784) | # 80774
        between(RowNum_OG, 33840, 33860) # 33850
    )
)
# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
# Each window is the example row (named in the trailing comment) +/- 10 rows.
View(
  TTLargeRteChng %>%
    filter(
      between(RowNum_OG, 2250290, 2250310) | # 2250300
        between(RowNum_OG, 867717, 867737) | # 867727
        between(RowNum_OG, 864379, 864399) | # 864389
        between(RowNum_OG, 808395, 808415) # 808405
    )
)

# examples where TravelTime_Sec is unusually small (with TravelDistance_Mi values that are large).
View(
  AllDays_NewTravelDist %>%
    filter(
      between(RowNum_OG, 1042228, 1042248) | # 1042238
        between(RowNum_OG, 53816, 53836) | # 53826
        between(RowNum_OG, 360571, 360591) | # 360581
        between(RowNum_OG, 502271, 502291) # 502281 (can't explain the weird TravelTime_Sec calculation here - it's not even an integer!)
    )
)

# still trying to explain 502281...on the day of this weirdness, the bus was only in circulation for 4-5 stops (~20 minutes) on that day (Oct 6)
View(AllDays_NewTravelDist %>% filter(Bus_ID == 2711))
# exploring large values for TravelTime_Sec
# NOTE(review): `== 300` keeps only rows at exactly 300 seconds, so the sort on
# TravelTime_Sec has no effect -- confirm whether `>= 300` was intended.
View(
  AllDays_NewTravelDist %>%
    filter(TravelTime_Sec == 300) %>%
    arrange(desc(TravelTime_Sec), SpeedAvg_Mph2)
)

# examples where TravelTime_Sec is unusually large (with TravelDistance_Mi values that are small, so SpeedAvg_Mph values are very small).
# Each window is the example row (named in the trailing comment) +/- 10 rows.
View(
  AllDays_NewTravelDist %>%
    filter(
      between(RowNum_OG, 2627459, 2627479) | # 2627469
        between(RowNum_OG, 2193344, 2193364) | # 2193354
        between(RowNum_OG, 1644123, 1644143) | # 1644133
        between(RowNum_OG, 869600, 869620) # 869610
    )
)
Investigation of SpeedAvg_Mph2
View(Speed_Pctiles): 90% of SpeedAvg_Mph2 are between ~3mph and ~66mph.
# Rank every SpeedAvg_Mph2 value: 100-tile bucket, minimum rank, percent rank,
# and percent rank rounded to two decimals.
Speed_Ntile <- data.frame(SpeedAvg_Mph2 = AllDays_NewTravelDist$SpeedAvg_Mph2) %>%
  mutate(
    Pctile = ntile(SpeedAvg_Mph2, 100),
    MinR = min_rank(SpeedAvg_Mph2),
    PctR = percent_rank(SpeedAvg_Mph2),
    PctR_Round = round(PctR, 2)
  )
str(Speed_Ntile)

Speed_Ntile_Rows <- nrow(Speed_Ntile)
View(tail(Speed_Ntile, 500))

# Per rounded percent rank: lowest speed, row count, share of all rows, and
# the running total of that share.
Speed_Pctiles <- Speed_Ntile %>%
  group_by(PctR_Round) %>%
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_Ntile_Rows
  ) %>%
  mutate(CumSumPAtP = cumsum(PctsAtPctile))
View(Speed_Pctiles)
Investigation of SpeedAvg_Mph2.
Exploring the removal of outlier TravelTime_Sec and TravelDistance_Mi.
# Speed summaries on all rows ...
AllDays_NewTravelDist %>%
  select(SpeedAvg_Mph, SpeedAvg_Mph2) %>%
  summary()

# ... and after trimming extreme travel distances and travel times
AllDays_NewTravelDist %>%
  filter(
    TravelDistance_Mi > 0.0001893939 & # lowest non-zero percentile
      TravelDistance_Mi < 1.0812500000 & # 99th percentile
      TravelTime_Sec > 10.050000 & # 2nd percentile
      TravelTime_Sec < 293.000000 # 98th percentile
  ) %>%
  select(SpeedAvg_Mph, SpeedAvg_Mph2) %>%
  summary()
Investigation of SpeedAvg_Mph2.
Histogram of SpeedAvg_Mph2.
# Density-scaled histogram (5 mph bins) of SpeedAvg_Mph2 with a density curve
# overlaid and each bin's count drawn as a text label. Rows with a missing
# speed are dropped first; the view is zoomed to 0-70 mph.
Speed_HistDen <- ggplot(
  filter(AllDays_NewTravelDist, !is.na(SpeedAvg_Mph2)),
  aes(x = SpeedAvg_Mph2, y = ..density..)
) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # Bin counts as labels. The mapping is passed by name and the call no longer
  # ends in a dangling comma (an empty positional argument).
  stat_bin(
    mapping = aes(label = format(..count.., big.mark = ",")),
    binwidth = 5,
    geom = "text",
    size = 2.5,
    vjust = 1.5
  ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  # ),
  # size = 3,
  # nudge_y = (..count.. * 0.1)
  # ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)) +
  # theme(legend.position="none") +
  labs(
    title = "Variation in Travel Speed",
    x = "Average Speed (mph)",
    y = "Density"
  )
Speed_HistDen
Investigation of SpeedAvg_Mph2.
Histogram of SpeedAvg_Mph2 after removing outlier TravelTime_Sec and TravelDistance_Mi.
# Percentile tables the cut-offs below were read from
View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

# Same histogram as for the full data, after dropping rows with a missing
# speed, a very short travel distance, or a very short travel time.
SpeedNoOutlier_HistDen <- ggplot(
  filter(
    AllDays_NewTravelDist,
    !is.na(SpeedAvg_Mph2) &
      TravelDistance_Mi_New > 0.077841005 & # 5th percentile
      # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
      TravelTime_Sec > 12.100000 # 4th percentile
    # TravelTime_Sec < 293.000000 # 98th percentile
  ),
  aes(x = SpeedAvg_Mph2, y = ..density..)
) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # Bin counts as labels. The mapping is passed by name and the call no longer
  # ends in a dangling comma (an empty positional argument).
  stat_bin(
    mapping = aes(label = format(..count.., big.mark = ",")),
    binwidth = 5,
    geom = "text",
    size = 2.5,
    vjust = 1.5
  ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  # ),
  # size = 3,
  # nudge_y = (..count.. * 0.1)
  # ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)) +
  # theme(legend.position="none") +
  labs(
    title = "Variation in Travel Speed",
    subtitle = "(removed low outliers of Travel Distance and Travel Time)",
    x = "Average Speed (mph)",
    y = "Density"
  )
SpeedNoOutlier_HistDen
Investigation of SpeedAvg_Mph2.
New dataset (NoOutliers_TravelDistNTime) when removing outlier low values of TravelDistance_Mi_New and TravelTime_Sec.
# Percentile tables the cut-offs below were read from
View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

# Keep only rows above the low-end cut-offs for distance and time
NoOutliers_TravelDistNTime <- AllDays_NewTravelDist %>%
  filter(
    TravelDistance_Mi_New > .077841005 & # 5th percentile
      # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
      TravelTime_Sec > 12.100000 # 4th percentile
    # TravelTime_Sec < 293.000000 # 98th percentile
  )

# Number of rows removed by the filter
nrow(AllDays_NewTravelDist) - nrow(NoOutliers_TravelDistNTime)
str(NoOutliers_TravelDistNTime)
summary(NoOutliers_TravelDistNTime)
Investigation of SpeedAvg_Mph2.
View(Speed_NoOut_Pctiles): Approximately 90% of SpeedAvg_Mph2 values are between ~4mph and ~56mph.
# Rank every SpeedAvg_Mph2 value in the trimmed data: 100-tile bucket, minimum
# rank, percent rank, and percent rank rounded to two decimals.
Speed_NoOut_Ntile <- data.frame(
  SpeedAvg_Mph2 = NoOutliers_TravelDistNTime$SpeedAvg_Mph2
) %>%
  mutate(
    Pctile = ntile(SpeedAvg_Mph2, 100),
    MinR = min_rank(SpeedAvg_Mph2),
    PctR = percent_rank(SpeedAvg_Mph2),
    PctR_Round = round(PctR, 2)
  )
str(Speed_NoOut_Ntile)

Speed_NoOut_Ntile_Rows <- nrow(Speed_NoOut_Ntile)
View(tail(Speed_NoOut_Ntile, 500))

# Per rounded percent rank: lowest speed, row count, share of all rows, and
# the running total of that share.
Speed_NoOut_Pctiles <- Speed_NoOut_Ntile %>%
  group_by(PctR_Round) %>%
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_NoOut_Ntile_Rows
  ) %>%
  mutate(CumSumPAtP = cumsum(PctsAtPctile))
View(Speed_NoOut_Pctiles)
Investigation of SpeedAvg_Mph2.
Exploring odd/impossible values.
# Exploring when SpeedAvg_Mph2 is NA -- does not occur at all
NoOutliers_TravelDistNTime %>%
  filter(is.na(SpeedAvg_Mph2)) %>%
  nrow()

# Exploring when SpeedAvg_Mph2 is zero -- does not occur at all
NoOutliers_TravelDistNTime %>%
  filter(SpeedAvg_Mph2 == 0) %>%
  nrow()

# All rows with a positive SpeedAvg_Mph2 below 3.2848770, slowest first
View(
  AllDays_NewTravelDist %>%
    filter(SpeedAvg_Mph2 > 0 & SpeedAvg_Mph2 < 3.2848770) %>%
    arrange(SpeedAvg_Mph2)
)

# examples where SpeedAvg_Mph2 < 3.2848770
# Each window is the example row (named in the trailing comment) +/- 10 rows.
View(
  AllDays_NewTravelDist %>%
    filter(
      between(RowNum_OG, 485338, 485358) | # 485348 -- Extreme travel time, Route Change
        between(RowNum_OG, 346952, 346972) | # 346962 -- Extreme travel time, Route Change
        between(RowNum_OG, 70494, 70514) | # 70504 -- Extreme travel time, Route Change
        between(RowNum_OG, 2051846, 2051866) # 2051856 -- Extreme travel time, Route Change
    )
)
Investigation of SpeedAvg_Mph2.
Limit the dataset based on SpeedAvg_Mph2.
# Keep rows whose SpeedAvg_Mph2 lies within the inclusive range below
NoOutliersSpeed <- NoOutliers_TravelDistNTime %>%
  filter(
    SpeedAvg_Mph2 >= 4.069300 & # 5th percentile
      SpeedAvg_Mph2 <= 56.05651 # 95th percentile
  )

# Number of rows removed by the filter
nrow(NoOutliers_TravelDistNTime) - nrow(NoOutliersSpeed)
summary(NoOutliersSpeed)
TravelTime now looks like it has some odd values on the high end. So let’s look at those.
View(TravTime_NoOut_Pctiles): Virtually all trips should take less than 5 minutes. (The 99th percentile of TravelTime is approximately 8 minutes.)
# Rank every TravelTime_Hr value in the speed-trimmed data: 100-tile bucket,
# minimum rank, percent rank, and percent rank rounded to two decimals.
TravTime_NoOut_Ntile <- data.frame(
  TravelTime_Hr = NoOutliersSpeed$TravelTime_Hr
) %>%
  mutate(
    Pctile = ntile(TravelTime_Hr, 100),
    MinR = min_rank(TravelTime_Hr),
    PctR = percent_rank(TravelTime_Hr),
    PctR_Round = round(PctR, 2)
  )
str(TravTime_NoOut_Ntile)

TravTime_NoOut_Ntile_Rows <- nrow(TravTime_NoOut_Ntile)
View(tail(TravTime_NoOut_Ntile, 500))

# Per rounded percent rank: shortest travel time (hours), row count, share of
# all rows, running total of that share, and the shortest time in seconds.
TravTime_NoOut_Pctiles <- TravTime_NoOut_Ntile %>%
  group_by(PctR_Round) %>%
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTime_NoOut_Ntile_Rows
  ) %>%
  mutate(
    CumSumPAtP = cumsum(PctsAtPctile),
    MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * (60 * 60)
  )
View(TravTime_NoOut_Pctiles)
Investigating odd TravelTime_Sec values.
Trips longer than ~8 minutes.
# All trips above the cut-off, longest first
View(
  NoOutliersSpeed %>%
    filter(TravelTime_Sec > 491) %>% # min at the 100th percentile
    arrange(desc(TravelTime_Sec))
)

# examples of TravelTime_Sec values that are largest.
# Each window is the example row (named in the trailing comment) +/- 10 rows.
View(
  NoOutliersSpeed %>%
    filter(
      between(RowNum_OG, 2071759, 2071779) | # 2071769 -- results from a route change, and a 3hr+ wait before the new route starts
        between(RowNum_OG, 1473686, 1473706) | # 1473696 -- results from a route change, and a 3hr wait before the new route starts
        between(RowNum_OG, 1222822, 1222842) | # 1222832 -- results from a route change, and a 3hr wait before the new route starts
        between(RowNum_OG, 3046089, 3046109) # 3046099 -- results from a route change, and a 3hr wait before the new route starts
    )
)

# examples of TravelTime_Sec values that are the smallest of the large.
View(
  NoOutliersSpeed %>%
    filter(
      between(RowNum_OG, 3044689, 3044709) | # 3044699 -- results from a route change
        between(RowNum_OG, 3022358, 3022378) | # 3022368 -- results from a route change
        between(RowNum_OG, 2993016, 2993036) | # 2993026 -- results from a previous route change (change occurred in deleted row)
        between(RowNum_OG, 2683703, 2683723) # 2683713 -- results from a previous route change (change occurred in deleted row)
    )
)
Let’s look at the TravelTime_Sec values and route changes (DirChange2).
The 99th percentile of TravelTime_Sec, both for all trips and for just those trips NOT involving route changes (DirChange2 = “Same”), is approximately 5 min (300 sec).
Nota Bene: The percentile calculation here is defined slightly differently than in most of the above analyses (which get the lowest value in the bin created by 100 ntiles).
# Summaries of TravelTime_Sec for all trips, for trips with no route change
# (DirChange2 == "Same"), and for trips with one (DirChange2 == "Change").
summary(select(NoOutliersSpeed, TravelTime_Sec))
summary(select(filter(NoOutliersSpeed, DirChange2 == "Same"), TravelTime_Sec))
summary(select(filter(NoOutliersSpeed, DirChange2 == "Change"), TravelTime_Sec))

# Percentile table (0th to 100th, 1% steps) of TravelTime_Sec for the same three
# groups. PctValue is the percentile; All / Same / Change hold the quantiles.
#
# Each quantile() call is given the column as a plain numeric vector ([[ ]])
# rather than a one-column data frame, so the result does not depend on
# quantile() flattening a data frame internally. The table is built in one step
# instead of being pre-filled with placeholder values and then overwritten.
# names = FALSE drops the "0%", "1%", ... labels so they do not become row names.
TravTimeSec_Qtiles_df <- data.frame(
  PctValue = seq(0, 100, 1),
  All = quantile(
    NoOutliersSpeed[["TravelTime_Sec"]],
    probs = seq(0, 1, 0.01),
    na.rm = TRUE,
    names = FALSE
  ),
  Same = quantile(
    filter(NoOutliersSpeed, DirChange2 == "Same")[["TravelTime_Sec"]],
    probs = seq(0, 1, 0.01),
    na.rm = TRUE,
    names = FALSE
  ),
  Change = quantile(
    filter(NoOutliersSpeed, DirChange2 == "Change")[["TravelTime_Sec"]],
    probs = seq(0, 1, 0.01),
    na.rm = TRUE,
    names = FALSE
  )
)
View(TravTimeSec_Qtiles_df)
Limit the dataset now based on TravelTime_Sec.
# Cap the data at 491 sec of travel time (the minimum at the 100th percentile).
UpperLimitTravTime <- NoOutliersSpeed %>%
  filter(TravelTime_Sec <= 491)

# Rows removed by the cap, then the structure and summary of the result.
nrow(NoOutliersSpeed) - nrow(UpperLimitTravTime)
str(UpperLimitTravTime)
summary(UpperLimitTravTime)
Investigation of Dwell_Time2 (how long the bus is at a stop).
Differences between Dwell_Time (by WMATA) and Dwell_Time2 (by me) appear to be due to switches in RouteAlt. WMATA calculates Dwell_Time by an unknown process. The WMATA calculation is equal to my calculation, except for the records immediately before and after a RouteAlt switch (DirChange2).
# Every row where WMATA's Dwell_Time disagrees with the recomputed Dwell_Time2.
View(filter(AllDays_NewOrder, Dwell_Time != Dwell_Time2))

# Examples where Dwell_Time and Dwell_Time2 differ: a +/-10 row window around
# each row of interest (centre row noted per line).
# NOTE(review): the window around row 172 previously ran to 192; it is narrowed
# to 182 here so it is +/-10 like every other window -- confirm this was a typo.
View(
  filter(
    AllDays_NewOrder,
    (RowNum_OG >= 65 & RowNum_OG <= 85) | # 75
      (RowNum_OG >= 162 & RowNum_OG <= 182) | # 172
      (RowNum_OG >= 431952 & RowNum_OG <= 431972) | # 431962
      (RowNum_OG >= 434595 & RowNum_OG <= 434615) # 434605 -- this record is NOT a route switch, but does have a Sequence switch (Me: should there really be a route switch here?)
  )
)
Investigation of Dwell_Time2 (how long the bus is at a stop).
First, create some “rank” stats. View(DwellTime2_Pctiles): 95% of Dwell_Time2s are <= 23 seconds…but some weird values (e.g., nearly 2 hour Dwell_Time2s) exist.
# Rank statistics for every Dwell_Time2 value in the full data set:
# 100-bin ntile, min_rank, percent_rank, and percent_rank rounded to 2 decimals.
DwellTime2_Ntile <- data.frame(
  Dwell_Time2 = AllDays_NewOrder$Dwell_Time2
) %>%
  mutate(
    Pctile = ntile(Dwell_Time2, 100),
    MinR = min_rank(Dwell_Time2),
    PctR = percent_rank(Dwell_Time2),
    PctR_Round = round(PctR, 2)
  )
str(DwellTime2_Ntile)

# Total row count, used as the denominator for the shares below.
DwellTime2_Ntile_Rows <- nrow(DwellTime2_Ntile)
View(tail(DwellTime2_Ntile, 500))

# One row per rounded percent rank: smallest dwell time in the group, the
# group's row count, its share of all rows, and the cumulative share.
DwellTime2_Pctiles <- DwellTime2_Ntile %>%
  group_by(PctR_Round) %>%
  summarise(
    MinDwellAtPctile = min(Dwell_Time2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DwellTime2_Ntile_Rows
  ) %>%
  mutate(CumSumPAtP = cumsum(PctsAtPctile))
View(DwellTime2_Pctiles)
Investigation of Dwell_Time2 (how long the bus is at a stop).
Histogram of Dwell_Time2.
# Density-scaled histogram of Dwell_Time2 (1-second bins) with a density line
# drawn on top. coord_cartesian() zooms the view to 1-25 sec on x and 0-0.05
# on y.
DwellTime2_HistDen <- ggplot(
  data = AllDays_NewOrder,
  mapping = aes(x = Dwell_Time2, y = ..density..)
) +
  geom_histogram(
    binwidth = 1,
    fill = "lightblue",
    colour = "grey60",
    size = 0.2
  ) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(1, 25), ylim = c(0, 0.05)) +
  labs(
    x = "Time a Bus Stays at a Stop (sec)",
    y = "Density"
  ) +
  # Disabled options: theme(legend.position="none"); an italic atop() subtitle.
  ggtitle(expression(atop("Variation in How Long a Bus Stays at a Stop")))

DwellTime2_HistDen
Investigation of Dwell_Time2 (how long the bus is at a stop).
Looking at some weirdly long Dwell_Time2 values.
# Whole data set ordered by Dwell_Time2, longest first.
View(AllDays_NewOrder %>% arrange(desc(Dwell_Time2)))

# Examples of extremely large Dwell_Time2 values: a +/-10 row window around
# each row of interest (centre row noted per line).
View(
  AllDays_NewOrder %>%
    filter(
      (RowNum_OG >= 292669 & RowNum_OG <= 292689) | # 292679
        (RowNum_OG >= 531057 & RowNum_OG <= 531077) | # 531067
        (RowNum_OG >= 1388627 & RowNum_OG <= 1388647) | # 1388637
        (RowNum_OG >= 1645711 & RowNum_OG <= 1645731) # 1645721
    )
)

# Rows with a Dwell_Time2 of exactly zero.
View(AllDays_NewOrder %>% filter(Dwell_Time2 == 0))
Investigation of Delta_Time (how early or late the bus is).
View(DeltTime_Pctiles): 94% of Delta_Time values are between -236 seconds and 1,259 seconds. Roughly 66% of records are within 5 min late and 5 min early…but some weird (e.g., almost 50 minutes late or 40 minutes early) Delta_Times exist.
Note that Delta_Time is the difference from the scheduled bus arrival. So if two buses are scheduled to arrive at a destination at 10:00pm and 10:20pm, and if the 10:20pm bus has a Delta_Time of 5 minutes, there are 25 minutes between bus arrivals at the stop.
Also note that based on a comment at https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/, the Delta_Time values don’t appear to coincide with published bus schedules (e.g., the X2 departing every 8 minutes during peak hours).
# Rank statistics for every Delta_Time value in the full data set:
# 100-bin ntile, min_rank, percent_rank, and percent_rank rounded to 2 decimals.
DeltTime_Ntile <- data.frame(
  Delta_Time = AllDays_NewOrder$Delta_Time
) %>%
  mutate(
    Pctile = ntile(Delta_Time, 100),
    MinR = min_rank(Delta_Time),
    PctR = percent_rank(Delta_Time),
    PctR_Round = round(PctR, 2)
  )
str(DeltTime_Ntile)

# Total row count, used as the denominator for the shares below.
DeltTime_Ntile_Rows <- nrow(DeltTime_Ntile)
View(tail(DeltTime_Ntile, 500))

# One row per rounded percent rank: smallest Delta_Time in the group, the
# group's row count, its share of all rows, and the cumulative share.
DeltTime_Pctiles <- DeltTime_Ntile %>%
  group_by(PctR_Round) %>%
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>%
  mutate(CumSumPAtP = cumsum(PctsAtPctile))
View(DeltTime_Pctiles)
DeltTime_Pctiles

# Share of rows between 5 min early and 5 min late (~66%).
nrow(
  AllDays_NewOrder %>%
    filter(Delta_Time >= -300 & Delta_Time <= 300)
) / nrow(AllDays_NewOrder)
# Examples of weird large Delta_Times: rows earlier than -402 sec or later than
# 1705 sec, latest first.
# NOTE(review): the lower bound was previously -4202. That looks like a typo
# for -402, the bound paired with 1705 where ~2% of Delta_Time values are
# trimmed for the scatter-plot sample -- confirm against DeltTime_Pctiles.
View(
  filter(
    AllDays_NewOrder,
    Delta_Time < -402 |
      Delta_Time > 1705
  ) %>%
    arrange(desc(Delta_Time))
)
Investigation of Delta_Time (how early or late the bus is).
Delta_Time histogram.
# Density-scaled histogram of bus lateness in minutes (Delta_Time / 60) with
# 5-second bins and a density line on top; the view is zoomed to +/-5 minutes.
DeltTime_HistDen <- ggplot(
  data = AllDays_NewOrder,
  mapping = aes(x = (Delta_Time / 60), y = ..density..)
) +
  geom_histogram(
    binwidth = (5/60),
    fill = "lightblue",
    colour = "grey60",
    size = 0.2
  ) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(-5, 5)) +
  labs(
    x = "Bus Lateness (min)",
    y = "Density"
  ) +
  # Disabled option: theme(legend.position="none")
  # Two-line plotmath title: main title over an italic subtitle.
  ggtitle(
    expression(
      atop(
        "Variation in How Early/Late a Bus Is",
        atop(italic("(positive values are late arrivals)"), "")
      )
    )
  )

DeltTime_HistDen
Investigation of Delta_Time (how early or late the bus is).
Delta_Time boxplot.
# Median lateness (in minutes) per hour group. Despite its name, Count_Values
# holds medians; it feeds the text labels drawn on the box plot below.
Count_Values <- ddply(
  AllDays_NewOrder,
  .(Event_Time_HrGroup),
  summarise,
  Value_Counts = median(Delta_Time / 60, na.rm = TRUE)
)

# Notched box plot of lateness (minutes) by hour group, one fill colour per
# group, outliers in red, and each group's median printed just above its
# median line. The view is zoomed to -5..20 minutes.
DeltTime_BoxPlot <- ggplot(
  data = AllDays_NewOrder,
  mapping = aes(
    x = factor(Event_Time_HrGroup),
    y = Delta_Time / 60,
    fill = factor(Event_Time_HrGroup)
  )
) +
  geom_boxplot(outlier.colour = "red", notch = TRUE) +
  # Alternative zoom when plotting seconds: coord_cartesian(ylim = c(-300, 1200))
  coord_cartesian(ylim = c(-5, 20)) +
  geom_text(
    data = Count_Values,
    mapping = aes(
      y = Value_Counts,
      label = format(round(Value_Counts, digits = 1), nsmall = 1)
    ),
    size = 3,
    vjust = -0.5
  ) +
  labs(
    x = "Hour Group",
    y = "Bus Lateness (minutes)"
  ) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45)) +
  # Alternative theme: theme(legend.position="right", axis.text.x = element_blank())
  ggtitle(
    expression(
      atop(
        "How Early/Late is the Bus (by Hour Group)",
        atop(italic("(positive values are late arrivals)"), "")
      )
    )
  )

DeltTime_BoxPlot
Investigation of Delta_Time (how early or late the bus is).
Exploring “extreme” Delta_Times. First let’s get some “rank” stats.
# Re-inspect the signed Delta_Time percentile table for reference.
View(DeltTime_Pctiles)
DeltTime_Pctiles

# Rank statistics for the absolute lateness |Delta_Time| (early and late are
# treated alike): 100-bin ntile, min_rank, percent_rank, and percent_rank
# rounded to 2 decimals.
DeltTimeAbs_Ntile <- as.data.frame(abs(AllDays_NewOrder$Delta_Time)) %>%
  mutate(
    Pctile = ntile(abs(AllDays_NewOrder$Delta_Time), 100),
    MinR = min_rank(abs(AllDays_NewOrder$Delta_Time)),
    PctR = percent_rank(abs(AllDays_NewOrder$Delta_Time)),
    PctR_Round = round(PctR, 2)
  )
colnames(DeltTimeAbs_Ntile)[1] <- "Delta_Time_Abs"
str(DeltTimeAbs_Ntile)

# Total row count of this table, used as the denominator for the shares below.
DeltTimeAbs_Ntile_Rows <- nrow(DeltTimeAbs_Ntile)
View(tail(DeltTimeAbs_Ntile, 500))

# One row per rounded percent rank: smallest |Delta_Time| in the group, the
# group's row count, its share of all rows, and the cumulative share.
# The share is divided by this table's own row count (DeltTimeAbs_Ntile_Rows)
# instead of the signed table's DeltTime_Ntile_Rows, so the chunk no longer
# depends on a variable created by a different chunk.
DeltTimeAbs_Pctiles <- group_by(DeltTimeAbs_Ntile, PctR_Round) %>%
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time_Abs),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTimeAbs_Ntile_Rows
  ) %>%
  mutate(CumSumPAtP = cumsum(PctsAtPctile))
View(DeltTimeAbs_Pctiles)
DeltTimeAbs_Pctiles
Investigation of Delta_Time (how early or late the bus is).
Exploring “extreme” Delta_Times. Then let’s calculate the percentage of buses that are 10 minutes (or more) late/early.
# Number of records in each hour group.
HrGroup_DeltaTime_All <- AllDays_NewOrder %>%
  group_by(Event_Time_HrGroup) %>%
  summarise(EventAll_Cnt = n())
str(HrGroup_DeltaTime_All)
View(HrGroup_DeltaTime_All)

# Number of records per hour group that are 10+ minutes (600+ sec) early or late.
HrGroup_DeltaTime_Above10Min <- AllDays_NewOrder %>%
  filter(abs(Delta_Time) >= 600) %>%
  group_by(Event_Time_HrGroup) %>%
  summarise(EventAbove10_Cnt = n())
str(HrGroup_DeltaTime_Above10Min)
View(HrGroup_DeltaTime_Above10Min)

# Put the two counts side by side and compute the 10+ minute share per hour
# group. The inner join keeps only hour groups present in both tables.
HrGroup_DeltaTimeCompare <- HrGroup_DeltaTime_Above10Min %>%
  inner_join(HrGroup_DeltaTime_All, by = "Event_Time_HrGroup") %>%
  mutate(PctEventsAbove10 = EventAbove10_Cnt / EventAll_Cnt)
View(HrGroup_DeltaTimeCompare)
Investigation of Delta_Time (how early or late the bus is).
Quickly plot these “extreme” Delta_Times.
# Column chart of the share of arrivals that are 10+ minutes early/late, by
# hour group. Each bar is labelled with its value rounded to 2 decimals, and
# the label is nudged down by 10% of the bar's height.
DeltTime_Above10_Cols <- ggplot(
  data = HrGroup_DeltaTimeCompare,
  mapping = aes(
    x = factor(Event_Time_HrGroup),
    y = PctEventsAbove10
  )
) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_text(
    mapping = aes(
      label = format(round(PctEventsAbove10, digits = 2), nsmall = 2)
    ),
    size = 3,
    nudge_y = (HrGroup_DeltaTimeCompare$PctEventsAbove10 * -0.1)
  ) +
  # Disabled option: coord_cartesian(xlim = c(-5, 5))
  labs(
    x = "Hour Group",
    y = "Percent of All Bus Arrivals"
  ) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45)) +
  # Disabled option: an italic "positive values are late arrivals" subtitle.
  ggtitle(expression(atop("When is a Bus 10+ Minutes Late/Early")))

DeltTime_Above10_Cols
Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).
Correlation.
# Correlation between time at a stop (Dwell_Time2) and lateness (Delta_Time),
# computed over the pairs where both values are present, stored as a matrix.
DwellTDeltaT_Corr <- as.matrix(
  cor(
    x = AllDays_NewOrder$Dwell_Time2,
    y = AllDays_NewOrder$Delta_Time,
    use = "pairwise.complete.obs"
  )
)
DwellTDeltaT_Corr
Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).
Next, let’s get a sample of data for plotting. Let’s do this for the full dataset (AllDays_NewOrder).
# Random 10% sample of the full data set, reduced to the two plotted columns
# and tagged "AllData" so it can be told apart once combined with other samples.
AllDays_NewOrder_10PctSamp <- AllDays_NewOrder %>%
  sample_frac(0.1) %>%
  select(Delta_Time, Dwell_Time2) %>%
  mutate(DataSet = "AllData")
str(AllDays_NewOrder_10PctSamp)
Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).
Let’s also get a sample of data for plotting, but with a dataset that removes outliers.
# Percentile tables consulted when choosing the cut-offs below.
View(DeltTime_Pctiles)
View(DwellTime2_Pctiles)

# Trim the extremes of both variables, then take a random 10% sample, keep the
# two plotted columns, and tag the rows "OutliersRemoved".
AllDays_NewOrder_NoExtremes_10PctSamp <- AllDays_NewOrder %>%
  filter(
    between(Delta_Time, -402, 1705), # removes about 2% of Delta_Time values
    between(Dwell_Time2, 1, 63) # removes about 2% of Dwell_Time2 values
  ) %>%
  sample_frac(0.1) %>%
  select(Delta_Time, Dwell_Time2) %>%
  mutate(DataSet = "OutliersRemoved")
str(AllDays_NewOrder_NoExtremes_10PctSamp)
Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).
Plotting the data from the dataset that does not remove outliers.
# Scatter plot of lateness against time at stop for the untrimmed 10% sample,
# with a red linear fit and the fitted equation printed in red at (2200, 600).
# lm_eqn() is defined elsewhere in this file; its result is handed to
# annotate() with parse = TRUE.
DwellTDeltaT_Scatter <- ggplot(
  data = AllDays_NewOrder_10PctSamp,
  mapping = aes(x = Dwell_Time2, y = Delta_Time)
) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "red") +
  annotate(
    geom = "text",
    label = lm_eqn(
      df = AllDays_NewOrder_10PctSamp,
      y = AllDays_NewOrder_10PctSamp$Delta_Time,
      x = AllDays_NewOrder_10PctSamp$Dwell_Time2
    ),
    x = 2200,
    y = 600,
    size = 3,
    colour = "red",
    parse = TRUE
  ) +
  labs(
    title = "Lateness vs Time at Stop",
    subtitle = "(no outliers removed)",
    x = "Time at Stop (sec)",
    y = "Lateness (sec)"
  )
# Disabled alternatives: separate xlab()/ylab() calls, a
# ggtitle(expression(atop(...))) title with an italic "(no outliers removed)"
# subtitle, and geom_jitter().

DwellTDeltaT_Scatter
Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).
Plotting the data from the dataset that does remove outliers.
# Scatter plot of lateness against time at stop for the trimmed 10% sample,
# with a blue linear fit and the fitted equation printed in blue at (50, -475).
# lm_eqn() is defined elsewhere in this file; its result is handed to
# annotate() with parse = TRUE.
DwellTDeltaT_Scatter_NoExtremes <- ggplot(
  data = AllDays_NewOrder_NoExtremes_10PctSamp,
  mapping = aes(x = Dwell_Time2, y = Delta_Time)
) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  annotate(
    geom = "text",
    label = lm_eqn(
      df = AllDays_NewOrder_NoExtremes_10PctSamp,
      y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
      x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
    ),
    x = 50,
    y = -475,
    size = 3,
    colour = "blue",
    parse = TRUE
  ) +
  labs(
    title = "Lateness vs Time at Stop",
    subtitle = "(2% of outliers removed)",
    x = "Time at Stop (sec)",
    y = "Lateness (sec)"
  )
# Disabled alternatives: separate xlab()/ylab() calls, a
# ggtitle(expression(atop(...))) title with an italic "(2% of outliers removed)"
# subtitle, and geom_jitter().

DwellTDeltaT_Scatter_NoExtremes
Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).
Plotting the data from both datasets together.
# Stack the untrimmed and trimmed samples; DataSet (turned into a factor)
# records which sample each row came from.
CombinedData <- rbind(
  AllDays_NewOrder_10PctSamp,
  AllDays_NewOrder_NoExtremes_10PctSamp
)
CombinedData[["DataSet"]] <- factor(CombinedData[["DataSet"]])
str(CombinedData)

# Both samples on one scatter plot, coloured by DataSet and zoomed to
# 0-500 sec on x and -1000..2000 sec on y. Each sample gets its own linear fit
# (red = "AllData", blue = "OutliersRemoved") and its own equation label.
# lm_eqn() is defined elsewhere in this file; its result is handed to
# annotate() with parse = TRUE.
DwellTDeltaT_Scatter_Combined <- ggplot(
  data = CombinedData,
  mapping = aes(
    x = Dwell_Time2,
    y = Delta_Time,
    colour = DataSet
  )
) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  coord_cartesian(xlim = c(0, 500), ylim = c(-1000, 2000)) +
  geom_smooth(
    data = CombinedData %>% filter(DataSet == "AllData"),
    method = "lm",
    colour = "red"
  ) +
  geom_smooth(
    data = CombinedData %>% filter(DataSet == "OutliersRemoved"),
    method = "lm",
    colour = "blue"
  ) +
  # Disabled option: facet_wrap( ~ DataSet, ncol = 2)
  annotate(
    geom = "text",
    label = lm_eqn(
      df = AllDays_NewOrder_10PctSamp,
      y = AllDays_NewOrder_10PctSamp$Delta_Time,
      x = AllDays_NewOrder_10PctSamp$Dwell_Time2
    ),
    x = 300,
    y = -600,
    size = 3,
    colour = "red",
    parse = TRUE
  ) +
  annotate(
    geom = "text",
    label = lm_eqn(
      df = AllDays_NewOrder_NoExtremes_10PctSamp,
      y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
      x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
    ),
    x = 300,
    y = -800,
    size = 3,
    colour = "blue",
    parse = TRUE
  ) +
  theme(legend.position = "bottom") +
  labs(
    title = "Lateness vs Time at Stop",
    x = "Time at Stop (sec)",
    y = "Lateness (sec)"
  )
# Disabled alternatives: a ggtitle(expression(atop(...))) title with an italic
# "2% of outliers removed" subtitle, and geom_jitter().

DwellTDeltaT_Scatter_Combined